diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml new file mode 100644 index 000000000..c0a22ebf2 --- /dev/null +++ b/.github/workflows/cmake.yml @@ -0,0 +1,86 @@ +# Workflow with the CMake build system +name: Short-Tests-CMake + +# Controls when the workflow will run +on: + # Triggers the workflow on push (except to gh-readonly-queue branches), merge-group, and pull request events + push: + branches-ignore: + - "gh-readonly-queue**" + merge_group: + pull_request: + + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +# A workflow run is made up of one or more jobs that can run sequentially or in parallel +jobs: + build-TITANV: + runs-on: ubuntu-latest + container: + image: tgrogers/accel-sim_regress:Ubuntu-22.04-cuda-11.7 + env: + CONFIG: TITANV + + # Steps represent a sequence of tasks that will be executed as part of the job + steps: + # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it + - uses: actions/checkout@v4 + - name: Run Simulation + run: /bin/bash $GITHUB_WORKSPACE/short-tests-cmake.sh + + build-TITANV-LOCALXBAR: + runs-on: ubuntu-latest + container: + image: tgrogers/accel-sim_regress:Ubuntu-22.04-cuda-11.7 + env: + CONFIG: TITANV-LOCALXBAR + + # Steps represent a sequence of tasks that will be executed as part of the job + steps: + # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it + - uses: actions/checkout@v4 + - name: Run Simulation + run: /bin/bash $GITHUB_WORKSPACE/short-tests-cmake.sh + + build-QV100: + runs-on: ubuntu-latest + container: + image: tgrogers/accel-sim_regress:Ubuntu-22.04-cuda-11.7 + env: + CONFIG: QV100 + + # Steps represent a sequence of tasks that will be executed as part of the job + steps: + # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it + - uses: actions/checkout@v4 + - name: Run Simulation + run: /bin/bash $GITHUB_WORKSPACE/short-tests-cmake.sh + + build-2060: + runs-on: ubuntu-latest + container: + image: tgrogers/accel-sim_regress:Ubuntu-22.04-cuda-11.7 + env: + CONFIG: RTX2060 + + # Steps represent a sequence of tasks that will be executed as part of the job + steps: + # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it + - uses: actions/checkout@v4 + - name: Run Simulation + run: /bin/bash $GITHUB_WORKSPACE/short-tests-cmake.sh + + build-3070: + runs-on: ubuntu-latest + container: + image: tgrogers/accel-sim_regress:Ubuntu-22.04-cuda-11.7 + env: + CONFIG: RTX3070 + + # Steps represent a sequence of tasks that will be executed as part of the job + steps: + # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it + - uses: actions/checkout@v4 + - name: Run Simulation + run: /bin/bash $GITHUB_WORKSPACE/short-tests-cmake.sh diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml new file mode 100644 index 000000000..8e0ae2324 --- /dev/null +++ b/.github/workflows/main.yml @@ -0,0 +1,105 @@ +# Basic short-test workflow for GPGPU-Sim + +name: Short-Tests + +# Controls when the workflow will run +on: + push: + branches-ignore: + - "gh-readonly-queue**" + merge_group: + pull_request: + + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +# A workflow run is made up of one or more jobs that can run sequentially or in parallel +jobs: + build-TITANV: + runs-on: ubuntu-latest + container: + image: tgrogers/accel-sim_regress:Ubuntu-22.04-cuda-11.7 + env: + CONFIG: TITANV + + # Steps represent a sequence of tasks that
will be executed as part of the job + steps: + # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it + - uses: actions/checkout@v4 + - name: Run Simulation + run: /bin/bash $GITHUB_WORKSPACE/short-tests.sh + + build-TITANV-LOCALXBAR: + runs-on: ubuntu-latest + container: + image: tgrogers/accel-sim_regress:Ubuntu-22.04-cuda-11.7 + env: + CONFIG: TITANV-LOCALXBAR + + # Steps represent a sequence of tasks that will be executed as part of the job + steps: + # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it + - uses: actions/checkout@v4 + - name: Run Simulation + run: /bin/bash $GITHUB_WORKSPACE/short-tests.sh + + build-QV100: + runs-on: ubuntu-latest + container: + image: tgrogers/accel-sim_regress:Ubuntu-22.04-cuda-11.7 + env: + CONFIG: QV100 + + # Steps represent a sequence of tasks that will be executed as part of the job + steps: + # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it + - uses: actions/checkout@v4 + - name: Run Simulation + run: /bin/bash $GITHUB_WORKSPACE/short-tests.sh + + build-2060: + runs-on: ubuntu-latest + container: + image: tgrogers/accel-sim_regress:Ubuntu-22.04-cuda-11.7 + env: + CONFIG: RTX2060 + + # Steps represent a sequence of tasks that will be executed as part of the job + steps: + # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it + - uses: actions/checkout@v4 + - name: Run Simulation + run: /bin/bash $GITHUB_WORKSPACE/short-tests.sh + + build-3070: + runs-on: ubuntu-latest + container: + image: tgrogers/accel-sim_regress:Ubuntu-22.04-cuda-11.7 + env: + CONFIG: RTX3070 + + # Steps represent a sequence of tasks that will be executed as part of the job + steps: + # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it + - uses: actions/checkout@v4 + - name: Run Simulation + run: /bin/bash $GITHUB_WORKSPACE/short-tests.sh + format-code: + if: github.event_name == 'pull_request' + runs-on: tgrogers-raid + needs: [build-TITANV, build-TITANV-LOCALXBAR, build-QV100, build-2060, build-3070] + steps: + - uses: actions/checkout@v4 + with: + ref: ${{github.event.pull_request.head.ref}} + repository: ${{github.event.pull_request.head.repo.full_name}} + ssh-key: '' + + - name: Run clang-format + run: | + git config user.name "purdue-jenkins" + git config user.email "purdue-jenkins@users.noreply.github.com" + git remote set-url origin git@github.com:${{github.event.pull_request.head.repo.full_name}} + git remote -v + /bin/bash ./format-code.sh + if git status --untracked-files=no | grep -q "nothing to commit"; then echo "No changes to commit."; else git commit -a -m "Automated Format"; git push; fi diff --git a/.github/workflows/sst_integration.yml b/.github/workflows/sst_integration.yml new file mode 100644 index 000000000..03635db64 --- /dev/null +++ b/.github/workflows/sst_integration.yml @@ -0,0 +1,80 @@ +# Workflow for testing the SST/balar integration +name: SST Integration Test + +# Controls when the workflow will run +on: + # Triggers the workflow on push (except to gh-readonly-queue branches) and pull request events + push: + branches-ignore: + - "gh-readonly-queue**" + pull_request: + + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +# A workflow run is made up of one or more jobs that can run sequentially or in parallel +jobs: + build-QV100: + runs-on: ubuntu-latest + defaults: + run: + shell: bash + strategy: + matrix: + # test_type: [simple, medium, long] + test_type: [simple, medium] + container: + image:
tgrogers/accel-sim_regress:SST-Integration-Ubuntu-22.04-cuda-11.7-llvm-18.1.8-riscv-gnu-2024.08.06-nightly + env: + CONFIG: QV100 + GPU_ARCH: sm_70 + + # Steps represent a sequence of tasks that will be executed as part of the job + steps: + # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it + - uses: actions/checkout@v4 + - name: Build GPGPU-Sim in SST mode + run: | + source ./setup_environment + make -j4 + - name: Prepare SST dependencies + run: | + apt install -y openmpi-bin openmpi-common libtool libtool-bin autoconf python3 python3-dev automake build-essential git + # Use personal repo for now + - name: Build SST-Core + run: | + git clone https://github.com/William-An/sst-core.git + cd sst-core + git pull + git checkout devel + ./autogen.sh + ./configure --prefix=`realpath ../sstcore-install` --disable-mpi --disable-mem-pools + make -j4 + make install + cd .. + rm -rf ./sst-core + # Use personal repo for now + - name: Build SST-Elements + run: | + git clone https://github.com/William-An/sst-elements.git + source ./setup_environment + cd sst-elements + git pull + git checkout balar-mmio-vanadis-llvm + ./autogen.sh + ./configure --prefix=`realpath ../sstelements-install` --with-sst-core=`realpath ../sstcore-install` --with-cuda=$CUDA_INSTALL_PATH --with-gpgpusim=$GPGPUSIM_ROOT + make -j4 + make install + # Have to resource the gpu app + # Also fake a SDK since rodinia 2.0 does not need this, speed things up on github + - name: Balar Test + run: | + pip install testtools blessings pygments + source ./setup_environment + mkdir 4.2 + mkdir fake_sdk + export NVIDIA_COMPUTE_SDK_LOCATION=$(readlink -f ./fake_sdk) + source $GPUAPPS_ROOT/src/setup_environment sst + rm -rf 4.2 + rm -f gpucomputingsdk_4.2.9_linux.run + ./sstcore-install/bin/sst-test-elements -w "*balar*${{ matrix.test_type }}*" \ No newline at end of file diff --git a/.gitignore b/.gitignore index 4b343c557..340277af8 100644 --- a/.gitignore +++ b/.gitignore @@ -61,3 +61,4 @@ debug_tools/WatchYourStep/ptxjitplus/*.ptx accel-sim-framework/ gpu-app-collection/ +setup diff --git a/CHANGES b/CHANGES index 0c48a3dc0..5d1cd1082 100644 --- a/CHANGES +++ b/CHANGES @@ -1,4 +1,20 @@ LOG: +Version 4.2.0 vs 4.1.0 +- Added AccelWattch power model v1.0 which replaces GPUWattch. +- Added AccelWattch XML configuration files for SM7_QV100, SM7_TITANV, SM75_RTX2060_S, SM6_TITANX. Note that all these AccelWattch XML configuration files are tuned only for SM7_QV100. 
+ +Version 4.1.0 versus 4.0.0 +-Features: +1- Support L1 write-allocate with a sub-sector writing policy, as in Volta+ hardware, and change the Volta+ card configs to make L1 write-allocate with write-through +2- Make the L1 adaptive cache policy configurable +3- Add Ampere RTX 3060 config files +-Bugs: +1- Fix the L1 bank hash function bug +2- Fix the L1 read hit counters in gpgpu-sim to match nvprof, achieving more accurate L1 correlation with the HW +3- Fix bugs in lazy write handling (thanks to Gwendolyn Voskuilen from Sandia Labs for this fix) +4- Fix the backend pipeline for the sub_core model +5- Fix a memory stomp bug in the shader_config +6- Some code refactoring Version 4.0.0 (development branch) versus 3.2.3 -Front-End: 1- Support .nc cache modifier and __ldg function to access the read-only L1D cache diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 000000000..95ca8e085 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,167 @@ +cmake_minimum_required(VERSION 3.17) + +# Project name and version +project(GPGPU-Sim + VERSION 4.2.0 + DESCRIPTION "cycle-level simulator modeling contemporary graphics processing units (GPUs)" + HOMEPAGE_URL https://github.com/accel-sim/gpgpu-sim_distribution + LANGUAGES CXX) + +# Specify the C++ standard +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED True) + +# GPGPU-Sim build option +option(GPGPUSIM_ENABLE_TRACE "Whether to enable GPGPU-Sim debug tracing" ON) + +# GPGPU-Sim conditional build variable +set(GPGPUSIM_USE_POWER_MODEL OFF) +set(GPGPUSIM_USE_OPENCL OFF) + +# Check for dependencies +include(gpgpusim_check.cmake) + +# Create version file +add_custom_target(gen_build_string ALL + COMMAND ${CMAKE_COMMAND} -D INPUT_DIR=${CMAKE_CURRENT_SOURCE_DIR} -D OUTPUT_DIR=${CMAKE_BINARY_DIR} -P ${CMAKE_CURRENT_SOURCE_DIR}/gpgpusim_gen_build_string.cmake + COMMENT "Generating build string file to ${CMAKE_CURRENT_BINARY_DIR}") + +# CMake target +# GPGPU-Sim CUDA Runtime lib +# Use the entrypoint object files sources else CMake will complain +add_library(cudart SHARED $<TARGET_OBJECTS:gpgpusim_entrypoint>) +add_library(entrypoint STATIC $<TARGET_OBJECTS:gpgpusim_entrypoint>) + +# Add global C/CXX compilation flags and definitions +# TODO Specify more build modes like gem5 with fast opt?
+if(${CMAKE_BUILD_TYPE} STREQUAL "Debug") + add_compile_definitions(DEBUG=1) + add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:-Wall;-Wno-unused-function;-Wno-sign-compare;-g;-fPIC>") + add_compile_options("$<$<COMPILE_LANGUAGE:C>:-Wall;-Wno-unused-function;-Wno-sign-compare;-ggdb;-fPIC>") +else() + add_compile_definitions(DEBUG=0) + add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:-O3;-g;-Wall;-Wno-unused-function;-Wno-sign-compare;-fPIC>") + add_compile_options("$<$<COMPILE_LANGUAGE:C>:-Wall;-Wno-unused-function;-Wno-sign-compare;-fPIC>") +endif() + +# Add CUDA version +add_compile_definitions(CUDART_VERSION=${CUDA_VERSION_NUMBER}) + +# OpenCL support +if(GPGPUSIM_USE_OPENCL) + add_compile_definitions(OPENGL_SUPPORT) +endif() + +# Tracing support +if(GPGPUSIM_ENABLE_TRACE) + add_compile_definitions(TRACING_ON=1) +endif() + +# Add subdirectory +add_subdirectory(src) +add_subdirectory(libcuda) +add_subdirectory(libopencl) + +# Set linker option for libcudart.so +if(APPLE) + target_link_options(cudart PUBLIC "-Wl,-headerpad_max_install_names,-undefined,dynamic_lookup,-compatibility_version,1.1,-current_version,1.1;-lm;-lz;-pthread") +else() + target_link_options(cudart PUBLIC + "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/linux-so-version.txt;-lm;-lz;-lGL;-pthread") + target_link_options(entrypoint PUBLIC + "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/linux-so-version.txt;-lm;-lz;-lGL;-pthread") +endif() +# cuda: CUDA API lib +# ptxsim: cuda-sim, functional simulator +# gpgpusim: gpu simulator (gpgpu-sim) +# intersim: interconnect simulator +# accelwattch: power simulator +# Rest of source files in src/ will be created with gpgpusim_entrypoint target +target_link_libraries(cudart PUBLIC cuda ptxsim gpgpusim intersim) +target_link_libraries(entrypoint PUBLIC cuda ptxsim gpgpusim intersim) +if(GPGPUSIM_USE_POWER_MODEL) +target_link_libraries(cudart PUBLIC cuda ptxsim gpgpusim intersim accelwattch) +target_link_libraries(entrypoint PUBLIC cuda ptxsim gpgpusim intersim accelwattch) +endif() + +# TODO Conditionally build for Opencl?
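The targets above mirror the existing `make` build (`cuda`, `ptxsim`, `gpgpusim`, `intersim`, plus `accelwattch` when the power model is enabled). As a rough sketch of how this CMake flow might be driven, assuming a plain out-of-source build (nothing below is prescribed by the repository itself):

```sh
# Hypothetical usage of the new CMake build; option names are the ones defined above.
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGPGPUSIM_ENABLE_TRACE=ON
cmake --build build -j "$(nproc)"
# Runs the install rules further down: copies libcudart.so into
# lib/gcc-<version>/cuda-<version>/<build mode>/ and creates the
# per-CUDA-version symlinks (libcudart.so.2 ... libcudart.so.11.0).
cmake --install build
```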
+# if(GPGPUSIM_USE_OPENCL) +# add_library(OpenCL) +# endif() + +# Install and post-install +# Get configure +set(GPGPUSIM_CONFIG "gcc-${CMAKE_CXX_COMPILER_VERSION}/cuda-${CUDA_VERSION_NUMBER}/${GPGPUSIM_BUILD_MODE}") + +# Env var setup script +include(gpgpusim_gen_setup_environment.cmake) + +# Installation +set(GPGPUSIM_INSTALL_PATH ${PROJECT_SOURCE_DIR}/lib/${GPGPUSIM_CONFIG}) +install(TARGETS cudart DESTINATION ${GPGPUSIM_INSTALL_PATH}) + +# Installing symlinks +install(CODE "execute_process\(\ + COMMAND ${CMAKE_COMMAND} -E create_symlink \ + ${GPGPUSIM_INSTALL_PATH}/$<TARGET_FILE_NAME:cudart> \ + ${GPGPUSIM_INSTALL_PATH}/$<TARGET_FILE_NAME:cudart>.2\)") +install(CODE "execute_process\(\ + COMMAND ${CMAKE_COMMAND} -E create_symlink \ + ${GPGPUSIM_INSTALL_PATH}/$<TARGET_FILE_NAME:cudart> \ + ${GPGPUSIM_INSTALL_PATH}/$<TARGET_FILE_NAME:cudart>.3\)") +install(CODE "execute_process\(\ + COMMAND ${CMAKE_COMMAND} -E create_symlink \ + ${GPGPUSIM_INSTALL_PATH}/$<TARGET_FILE_NAME:cudart> \ + ${GPGPUSIM_INSTALL_PATH}/$<TARGET_FILE_NAME:cudart>.4\)") +install(CODE "execute_process\(\ + COMMAND ${CMAKE_COMMAND} -E create_symlink \ + ${GPGPUSIM_INSTALL_PATH}/$<TARGET_FILE_NAME:cudart> \ + ${GPGPUSIM_INSTALL_PATH}/$<TARGET_FILE_NAME:cudart>.5.0\)") +install(CODE "execute_process\(\ + COMMAND ${CMAKE_COMMAND} -E create_symlink \ + ${GPGPUSIM_INSTALL_PATH}/$<TARGET_FILE_NAME:cudart> \ + ${GPGPUSIM_INSTALL_PATH}/$<TARGET_FILE_NAME:cudart>.5.5\)") +install(CODE "execute_process\(\ + COMMAND ${CMAKE_COMMAND} -E create_symlink \ + ${GPGPUSIM_INSTALL_PATH}/$<TARGET_FILE_NAME:cudart> \ + ${GPGPUSIM_INSTALL_PATH}/$<TARGET_FILE_NAME:cudart>.6.0\)") +install(CODE "execute_process\(\ + COMMAND ${CMAKE_COMMAND} -E create_symlink \ + ${GPGPUSIM_INSTALL_PATH}/$<TARGET_FILE_NAME:cudart> \ + ${GPGPUSIM_INSTALL_PATH}/$<TARGET_FILE_NAME:cudart>.6.5\)") +install(CODE "execute_process\(\ + COMMAND ${CMAKE_COMMAND} -E create_symlink \ + ${GPGPUSIM_INSTALL_PATH}/$<TARGET_FILE_NAME:cudart> \ + ${GPGPUSIM_INSTALL_PATH}/$<TARGET_FILE_NAME:cudart>.7.0\)") +install(CODE "execute_process\(\ + COMMAND ${CMAKE_COMMAND} -E create_symlink \ + ${GPGPUSIM_INSTALL_PATH}/$<TARGET_FILE_NAME:cudart> \ + ${GPGPUSIM_INSTALL_PATH}/$<TARGET_FILE_NAME:cudart>.7.5\)") +install(CODE "execute_process\(\ + COMMAND ${CMAKE_COMMAND} -E create_symlink \ + ${GPGPUSIM_INSTALL_PATH}/$<TARGET_FILE_NAME:cudart> \ + ${GPGPUSIM_INSTALL_PATH}/$<TARGET_FILE_NAME:cudart>.8.0\)") +install(CODE "execute_process\(\ + COMMAND ${CMAKE_COMMAND} -E create_symlink \ + ${GPGPUSIM_INSTALL_PATH}/$<TARGET_FILE_NAME:cudart> \ + ${GPGPUSIM_INSTALL_PATH}/$<TARGET_FILE_NAME:cudart>.9.0\)") +install(CODE "execute_process\(\ + COMMAND ${CMAKE_COMMAND} -E create_symlink \ + ${GPGPUSIM_INSTALL_PATH}/$<TARGET_FILE_NAME:cudart> \ + ${GPGPUSIM_INSTALL_PATH}/$<TARGET_FILE_NAME:cudart>.9.1\)") +install(CODE "execute_process\(\ + COMMAND ${CMAKE_COMMAND} -E create_symlink \ + ${GPGPUSIM_INSTALL_PATH}/$<TARGET_FILE_NAME:cudart> \ + ${GPGPUSIM_INSTALL_PATH}/$<TARGET_FILE_NAME:cudart>.9.2\)") +install(CODE "execute_process\(\ + COMMAND ${CMAKE_COMMAND} -E create_symlink \ + ${GPGPUSIM_INSTALL_PATH}/$<TARGET_FILE_NAME:cudart> \ + ${GPGPUSIM_INSTALL_PATH}/$<TARGET_FILE_NAME:cudart>.10.0\)") +install(CODE "execute_process\(\ + COMMAND ${CMAKE_COMMAND} -E create_symlink \ + ${GPGPUSIM_INSTALL_PATH}/$<TARGET_FILE_NAME:cudart> \ + ${GPGPUSIM_INSTALL_PATH}/$<TARGET_FILE_NAME:cudart>.10.1\)") +install(CODE "execute_process\(\ + COMMAND ${CMAKE_COMMAND} -E create_symlink \ + ${GPGPUSIM_INSTALL_PATH}/$<TARGET_FILE_NAME:cudart> \ + ${GPGPUSIM_INSTALL_PATH}/$<TARGET_FILE_NAME:cudart>.11.0\)") \ No newline at end of file diff --git a/COPYRIGHT b/COPYRIGHT index a4eea2915..1c949f93e 100644 --- a/COPYRIGHT +++ b/COPYRIGHT @@ -44,3 +44,33 @@ per UBC policy 88, item 2.3 on literary works) these students names appear in the copyright notices of the respective files. UBC is also mentioned in the copyright notice to highlight that was the author's affiliation when the work was performed. + +NOTE 3: AccelWattch and all its components are covered by the following license and copyright. +Copyright (c) 2018-2021, Vijay Kandiah, Junrui Pan, Mahmoud Khairy, Scott Peverelle, Timothy Rogers, Tor M.
Aamodt, Nikos Hardavellas +Northwestern University, Purdue University, The University of British Columbia +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer; +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution; +3. Neither the names of Northwestern University, Purdue University, + The University of British Columbia nor the names of their contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/Jenkinsfile b/Jenkinsfile index a3db3e503..4ef467bae 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -87,23 +87,55 @@ pipeline { ssh tgrogers@dynamo.ecn.purdue.edu "cd $PLOTDIR && rm -rf latest && cp -r ${BUILD_NUMBER} latest"' } } + stage('sst-core-build') { + steps { + sh 'rm -rf sstcore-install' + sh 'rm -rf sst-core && git clone git@github.com:sstsimulator/sst-core.git' + sh '''#!/bin/bash + cd sst-core + ./autogen.sh + ./configure --prefix=`realpath ../sstcore-install` --disable-mpi --disable-mem-pools + make -j 10 + make install''' + } + } + stage('sst-elements-build') { + steps { + sh 'rm -rf sstelements-install' + sh 'rm -rf sst-elements && git clone git@github.com:sstsimulator/sst-elements.git' + // First sourcing the env_setup and setup_environment script for env vars + sh '''#!/bin/bash + source ./env-setup/11.0_env_setup.sh + source `pwd`/setup_environment + cd sst-elements + ./autogen.sh + ./configure --prefix=`realpath ../sstelements-install` --with-sst-core=`realpath ../sstcore-install` --with-cuda=$CUDA_INSTALL_PATH --with-gpgpusim=$GPGPUSIM_ROOT + make -j 10 + make install''' + } + } + stage('sst balar test') { + steps { + sh '''#!/bin/bash + source ./env-setup/11.0_env_setup.sh + source `pwd`/setup_environment sst + ./sstcore-install/bin/sst-test-elements -p ./sst-elements/src/sst/elements/balar/tests''' + } + } } post { success { -// sh 'git remote rm upstream' - emailext body:'''${SCRIPT, template="groovy-html.success.template"}''', + emailext body: "See ${BUILD_URL}.", recipientProviders: [[$class: 'CulpritsRecipientProvider'], [$class: 'RequesterRecipientProvider']], - subject: "[AALP Jenkins] Build #${BUILD_NUMBER} - Success!", - attachmentsPattern: 'correl.*.txt', + subject: "[AALP Jenkins] Build ${JOB_NAME} #${BUILD_NUMBER} - Success!", to: 'tgrogers@purdue.edu' } failure { -// sh 'git remote rm upstream' 
emailext body: "See ${BUILD_URL}", recipientProviders: [[$class: 'CulpritsRecipientProvider'], [$class: 'RequesterRecipientProvider']], - subject: "[AALP Jenkins] Build #${BUILD_NUMBER} - ${currentBuild.result}", + subject: "[AALP Jenkins] Build ${JOB_NAME} #${BUILD_NUMBER} - ${currentBuild.result}", to: 'tgrogers@purdue.edu' } } diff --git a/Makefile b/Makefile index d248211cd..37dba0146 100644 --- a/Makefile +++ b/Makefile @@ -34,6 +34,7 @@ INTERSIM ?= intersim2 include version_detection.mk +# Check for debug ifeq ($(GPGPUSIM_CONFIG), gcc-$(CC_VERSION)/cuda-$(CUDART_VERSION)/debug) export DEBUG=1 else @@ -87,7 +88,7 @@ ifneq ($(GPGPUSIM_POWER_MODEL),) MCPAT_DBG_FLAG = dbg endif - MCPAT_OBJ_DIR = $(SIM_OBJ_FILES_DIR)/gpuwattch + MCPAT_OBJ_DIR = $(SIM_OBJ_FILES_DIR)/accelwattch MCPAT = $(MCPAT_OBJ_DIR)/*.o endif @@ -117,24 +118,24 @@ check_setup_environment: fi check_power: - @if [ -d "$(GPGPUSIM_ROOT)/src/gpuwattch/" -a ! -n "$(GPGPUSIM_POWER_MODEL)" ]; then \ + @if [ -d "$(GPGPUSIM_ROOT)/src/accelwattch/" -a ! -n "$(GPGPUSIM_POWER_MODEL)" ]; then \ echo ""; \ - echo " Power model detected in default directory ($(GPGPUSIM_ROOT)/src/gpuwattch) but GPGPUSIM_POWER_MODEL not set."; \ - echo " Please re-run setup_environment or manually set GPGPUSIM_POWER_MODEL to the gpuwattch directory if you would like to include the GPGPU-Sim Power Model."; \ + echo " Power model detected in default directory ($(GPGPUSIM_ROOT)/src/accelwattch) but GPGPUSIM_POWER_MODEL not set."; \ + echo " Please re-run setup_environment or manually set GPGPUSIM_POWER_MODEL to the accelwattch directory if you would like to include the GPGPU-Sim Power Model."; \ echo ""; \ true; \ elif [ ! -d "$(GPGPUSIM_POWER_MODEL)" ]; then \ echo ""; \ echo "ERROR ** Power model directory invalid."; \ echo "($(GPGPUSIM_POWER_MODEL)) is not a valid directory."; \ - echo "Please set GPGPUSIM_POWER_MODEL to the GPGPU-Sim gpuwattch directory."; \ + echo "Please set GPGPUSIM_POWER_MODEL to the GPGPU-Sim accelwattch directory."; \ echo ""; \ exit 101; \ elif [ -n "$(GPGPUSIM_POWER_MODEL)" -a ! -f "$(GPGPUSIM_POWER_MODEL)/gpgpu_sim.verify" ]; then \ echo ""; \ echo "ERROR ** Power model directory invalid."; \ echo "gpgpu_sim.verify not found in $(GPGPUSIM_POWER_MODEL)."; \ - echo "Please ensure that GPGPUSIM_POWER_MODEL points to a valid gpuwattch directory and that you have the correct GPGPU-Sim mcpat distribution."; \ + echo "Please ensure that GPGPUSIM_POWER_MODEL points to a valid accelwattch directory and that you have the correct GPGPU-Sim mcpat distribution."; \ echo ""; \ exit 102; \ fi @@ -168,6 +169,7 @@ $(SIM_LIB_DIR)/libcudart.so: makedirs $(LIBS) cudalib if [ ! -f $(SIM_LIB_DIR)/libcudart.so.10.0 ]; then ln -s libcudart.so $(SIM_LIB_DIR)/libcudart.so.10.0; fi if [ ! -f $(SIM_LIB_DIR)/libcudart.so.10.1 ]; then ln -s libcudart.so $(SIM_LIB_DIR)/libcudart.so.10.1; fi if [ ! -f $(SIM_LIB_DIR)/libcudart.so.11.0 ]; then ln -s libcudart.so $(SIM_LIB_DIR)/libcudart.so.11.0; fi + if [ ! -f $(SIM_LIB_DIR)/libcudart_mod.so ]; then ln -s libcudart.so $(SIM_LIB_DIR)/libcudart_mod.so; fi $(SIM_LIB_DIR)/libcudart.dylib: makedirs $(LIBS) cudalib g++ -dynamiclib -Wl,-headerpad_max_install_names,-undefined,dynamic_lookup,-compatibility_version,1.1,-current_version,1.1\ @@ -243,8 +245,8 @@ makedirs: if [ ! -d $(SIM_OBJ_FILES_DIR)/libopencl/bin ]; then mkdir -p $(SIM_OBJ_FILES_DIR)/libopencl/bin; fi; if [ ! -d $(SIM_OBJ_FILES_DIR)/$(INTERSIM) ]; then mkdir -p $(SIM_OBJ_FILES_DIR)/$(INTERSIM); fi; if [ ! 
-d $(SIM_OBJ_FILES_DIR)/cuobjdump_to_ptxplus ]; then mkdir -p $(SIM_OBJ_FILES_DIR)/cuobjdump_to_ptxplus; fi; - if [ ! -d $(SIM_OBJ_FILES_DIR)/gpuwattch ]; then mkdir -p $(SIM_OBJ_FILES_DIR)/gpuwattch; fi; - if [ ! -d $(SIM_OBJ_FILES_DIR)/gpuwattch/cacti ]; then mkdir -p $(SIM_OBJ_FILES_DIR)/gpuwattch/cacti; fi; + if [ ! -d $(SIM_OBJ_FILES_DIR)/accelwattch ]; then mkdir -p $(SIM_OBJ_FILES_DIR)/accelwattch; fi; + if [ ! -d $(SIM_OBJ_FILES_DIR)/accelwattch/cacti ]; then mkdir -p $(SIM_OBJ_FILES_DIR)/accelwattch/cacti; fi; all: $(MAKE) gpgpusim diff --git a/README.md b/README.md index 9f9f6698f..d0b14cdd0 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ Welcome to GPGPU-Sim, a cycle-level simulator modeling contemporary graphics processing units (GPUs) running GPU computing workloads written in CUDA or OpenCL. Also included in GPGPU-Sim is a performance visualization tool called -AerialVision and a configurable and extensible energy model called GPUWattch. -GPGPU-Sim and GPUWattch have been rigorously validated with performance and +AerialVision and a configurable and extensible power model called AccelWattch. +GPGPU-Sim and AccelWattch have been rigorously validated with performance and power measurements of real hardware GPUs. This version of GPGPU-Sim has been tested with a subset of CUDA version 4.2, @@ -11,6 +11,11 @@ This version of GPGPU-Sim has been tested with a subset of CUDA version 4.2, Please see the copyright notice in the file COPYRIGHT distributed with this release in the same directory as this file. +GPGPU-Sim 4.0 is compatible with Accel-Sim simulation framework. With the support +of Accel-Sim, GPGPU-Sim 4.0 can run NVIDIA SASS traces (trace-based simulation) +generated by NVIDIA's dynamic binary instrumentation tool (NVBit). For more information +about Accel-Sim, see [https://accel-sim.github.io/](https://accel-sim.github.io/) + If you use GPGPU-Sim 4.0 in your research, please cite: Mahmoud Khairy, Zhesheng Shen, Tor M. Aamodt, Timothy G Rogers. @@ -18,7 +23,7 @@ Accel-Sim: An Extensible Simulation Framework for Validated GPU Modeling. In proceedings of the 47th IEEE/ACM International Symposium on Computer Architecture (ISCA), May 29 - June 3, 2020. -If you use CuDNN or PyTorch support, checkpointing or our new debugging tool for functional +If you use CuDNN or PyTorch support (execution-driven simulation), checkpointing or our new debugging tool for functional simulation errors in GPGPU-Sim for your research, please cite: Jonathan Lew, Deval Shah, Suchita Pati, Shaylin Cattell, Mengchi Zhang, Amruth Sandhupatla, @@ -26,7 +31,6 @@ Christopher Ng, Negar Goli, Matthew D. Sinclair, Timothy G. Rogers, Tor M. Aamod Analyzing Machine Learning Workloads Using a Detailed GPU Simulator, arXiv:1811.08933, https://arxiv.org/abs/1811.08933 - If you use the Tensor Core model in GPGPU-Sim or GPGPU-Sim's CUTLASS Library for your research please cite: @@ -34,12 +38,11 @@ Md Aamir Raihan, Negar Goli, Tor Aamodt, Modeling Deep Learning Accelerator Enabled GPUs, arXiv:1811.08309, https://arxiv.org/abs/1811.08309 -If you use the GPUWattch energy model in your research, please cite: +If you use the AccelWattch power model in your research, please cite: -Jingwen Leng, Tayler Hetherington, Ahmed ElTantawy, Syed Gilani, Nam Sung Kim, -Tor M. Aamodt, Vijay Janapa Reddi, GPUWattch: Enabling Energy Optimizations in -GPGPUs, In proceedings of the ACM/IEEE International Symposium on Computer -Architecture (ISCA 2013), Tel-Aviv, Israel, June 23-27, 2013. 
+Vijay Kandiah, Scott Peverelle, Mahmoud Khairy, Junrui Pan, Amogh Manjunath, Timothy G. Rogers, Tor M. Aamodt, and Nikos Hardavellas. 2021. +AccelWattch: A Power Modeling Framework for Modern GPUs. In MICRO54: 54th Annual IEEE/ACM International Symposium on Microarchitecture +(MICRO ’21), October 18–22, 2021, Virtual Event, Greece. If you use the support for CUDA dynamic parallelism in your research, please cite: @@ -58,8 +61,8 @@ This file contains instructions on installing, building and running GPGPU-Sim. Detailed documentation on what GPGPU-Sim models, how to configure it, and a guide to the source code can be found here: . Instructions for building doxygen source code documentation are included below. -Detailed documentation on GPUWattch including how to configure it and a guide -to the source code can be found here: . + +Previous versions of GPGPU-Sim (3.2.0 to 4.1.0) included the [GPUWattch Energy model](http://gpgpu-sim.org/gpuwattch/) which has been replaced by AccelWattch version 1.0 in GPGPU-Sim version 4.2.0. AccelWattch supports modern GPUs and is validated against a NVIDIA Volta QV100 GPU. Detailed documentation on AccelWattch can be found here: [AccelWattch Overview](https://github.com/VijayKandiah/accel-sim-framework#accelwattch-overview) and [AccelWattch MICRO'21 Artifact Manual](https://github.com/VijayKandiah/accel-sim-framework/blob/release/AccelWattch.md). If you have questions, please sign up for the google groups page (see gpgpu-sim.org), but note that use of this simulator does not imply any level of @@ -104,21 +107,20 @@ library (part of the CUDA toolkit). Code to interface with the CUDA Math library is contained in cuda-math.h, which also includes several structures derived from vector_types.h (one of the CUDA header files). -## GPUWattch Energy Model +## AccelWattch Power Model -GPUWattch (introduced in GPGPU-Sim 3.2.0) was developed by researchers at the -University of British Columbia, the University of Texas at Austin, and the -University of Wisconsin-Madison. Contributors to GPUWattch include Tor -Aamodt's research group at the University of British Columbia: Tayler -Hetherington and Ahmed ElTantawy; Vijay Reddi's research group at the -University of Texas at Austin: Jingwen Leng; and Nam Sung Kim's research group -at the University of Wisconsin-Madison: Syed Gilani. +AccelWattch (introduced in GPGPU-Sim 4.2.0) was developed by researchers at +Northwestern University, Purdue University, and the University of British Columbia. +Contributors to AccelWattch include Nikos Hardavellas's research group at Northwestern University: +Vijay Kandiah; Tor Aamodt's research group at the University of British Columbia: Scott Peverelle; +and Timothy Rogers's research group at Purdue University: Mahmoud Khairy, Junrui Pan, and Amogh Manjunath. -GPUWattch leverages McPAT, which was developed by Sheng Li et al. at the +AccelWattch leverages McPAT, which was developed by Sheng Li et al. at the University of Notre Dame, Hewlett-Packard Labs, Seoul National University, and -the University of California, San Diego. The paper can be found at +the University of California, San Diego. The McPAT paper can be found at http://www.hpl.hp.com/research/mcpat/micro09.pdf. + # INSTALLING, BUILDING and RUNNING GPGPU-Sim Assuming all dependencies required by GPGPU-Sim are installed on your system, @@ -261,9 +263,10 @@ To clean the docs run The documentation resides at doc/doxygen/html. 
To run Pytorch applications with the simulator, install the modified Pytorch library as well by following instructions [here](https://github.com/gpgpu-sim/pytorch-gpgpu-sim). + ## Step 3: Run -Before we run, we need to make sure the application's executable file is dynamically linked to CUDA runtime library. This can be done during compilation of your program by introducing the nvcc flag "--cudart shared" in makefile (quotes should be excluded). +Before we run, we need to make sure the application's executable file is dynamically linked to the CUDA runtime library. This can be done during compilation of your program by introducing the nvcc flag "-lcudart" in the makefile (quotes should be excluded). To confirm the same, type the following command: @@ -311,15 +314,16 @@ need to re-compile your application simply to run it on GPGPU-Sim. To revert back to running on the hardware, remove GPGPU-Sim from your LD_LIBRARY_PATH environment variable. -The following GPGPU-Sim configuration options are used to enable GPUWattch +The following GPGPU-Sim configuration options are used to enable AccelWattch: -power_simulation_enabled 1 (1=Enabled, 0=Not enabled) - -gpuwattch_xml_file .xml - + -power_simulation_mode 0 (0=AccelWattch_SASS_SIM or AccelWattch_PTX_SIM, 1=AccelWattch_SASS_HW, 2=AccelWattch_SASS_HYBRID) + -accelwattch_xml_file <filename>.xml -The GPUWattch XML configuration file name is set to gpuwattch.xml by default and -currently only supplied for GTX480 (default=gpuwattch_gtx480.xml). Please refer to - for more information. +The AccelWattch XML configuration file name is set to accelwattch_sass_sim.xml by default and is +currently provided for SM7_QV100, SM7_TITANV, SM75_RTX2060_S, and SM6_TITANX. +Note that all these AccelWattch XML configuration files are tuned only for SM7_QV100. Please refer to +the [AccelWattch MICRO'21 Artifact Manual](https://github.com/VijayKandiah/accel-sim-framework/blob/release/AccelWattch.md) for more information. Running OpenCL applications is identical to running CUDA applications.
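To make the linking step and the power options concrete, here is a hedged sketch; `myapp` is a hypothetical application name, and the option values simply restate the defaults described above:

```sh
# Link the application dynamically against the CUDA runtime.
nvcc myapp.cu -o myapp -lcudart
# Confirm the dynamic link; with GPGPU-Sim's lib directory on LD_LIBRARY_PATH,
# libcudart.so should resolve to the simulator's copy rather than NVIDIA's.
ldd myapp | grep libcudart
# AccelWattch is then enabled from gpgpusim.config with options such as:
#   -power_simulation_enabled 1
#   -power_simulation_mode 0
#   -accelwattch_xml_file accelwattch_sass_sim.xml
```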
However, OpenCL applications need to communicate with the NVIDIA driver in order to diff --git a/aerialvision/configs.py b/aerialvision/configs.py index f0389ac20..01dba2e83 100644 --- a/aerialvision/configs.py +++ b/aerialvision/configs.py @@ -61,7 +61,7 @@ # Vancouver, BC V6T 1Z4 -import ConfigParser, os +import configparser, os userSettingPath = os.path.join(os.environ['HOME'], '.gpgpu_sim', 'aerialvision') @@ -69,14 +69,14 @@ class AerialVisionConfig: def __init__(self): - self.config = ConfigParser.SafeConfigParser() + self.config = configparser.SafeConfigParser() self.config.read( os.path.join(userSettingPath, 'config.rc') ) def print_all(self): for section in self.config.sections(): for option in self.config.options(section): value = self.config.get(section, option) - print "\t%s.%s = %s" % (section, option, value); + print("\t%s.%s = %s" % (section, option, value)); def get_value(self, section, option, default): if (self.config.has_option(section, option)): @@ -90,10 +90,11 @@ def get_value(self, section, option, default): #Unit test / configviewer def main(): - print "AerialVision Options:" + print("AerialVision Options:") avconfig.print_all() - print ""; + print(""); if __name__ == "__main__": main() + diff --git a/aerialvision/guiclasses.py b/aerialvision/guiclasses.py index 04036a8a3..f4ecd2938 100644 --- a/aerialvision/guiclasses.py +++ b/aerialvision/guiclasses.py @@ -64,10 +64,10 @@ import time import os import array -import Tkinter as Tk +import tkinter as Tk import matplotlib matplotlib.use('TkAgg') -from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg, NavigationToolbar2TkAgg +from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg, NavigationToolbar2Tk from matplotlib.figure import Figure import matplotlib as mpl from matplotlib.colors import colorConverter @@ -250,19 +250,19 @@ def chooseFile(self, *event): self.cXAxisData.delete(0, Tk.END) self.cYAxisData.delete(0, Tk.END) - + #filling in xAxis vars - for keys in self.data[self.fileChosen].keys(): + for keys in list(self.data[self.fileChosen].keys()): if keys == 'globalCycle': self.cXAxisData.insert(Tk.END, keys) #filling in yAxis vars #Need to fill up list alphabetically keysAlpha = [] - for key in self.data[self.fileChosen].keys(): + for key in list(self.data[self.fileChosen].keys()): if key not in ['globalCycle','CFLOG','EXTVARS']:#exclude hacks from list keysAlpha.append(key) - keysAlpha.sort(lambda x, y: cmp(x.lower(),y.lower())) + #keysAlpha.sort(key=lambda x, y: cmp(x.lower(),y.lower())) for keys in keysAlpha: self.cYAxisData.insert(Tk.END, keys) @@ -782,7 +782,7 @@ def __init__(self, master, data, res, dataChosen): #self.plot = self.figure.add_subplot(111) self.canvas = FigureCanvasTkAgg(self.figure, master=self.graphArea) self.canvas.get_tk_widget().pack() - self.toolbar = NavigationToolbar2TkAgg(self.canvas, self.toolbarArea) + self.toolbar = NavigationToolbar2Tk(self.canvas, self.toolbarArea) self.toolbar.update() self.plotData() @@ -931,7 +931,7 @@ def type1Variable(self, x, xAxis, y, yAxis, boolK, plotID): - if self.simplerName.has_key('globalTotInsn') == 'False': + if ('globalTotInsn' in self.simplerName) == 'False': graphOption = 1 if (graphOption == 1): @@ -972,7 +972,7 @@ def type2Variable(self, x, xAxis, y, yAxis, plotID): graphOption = "NULL" - if self.simplerName.has_key('globalTotInsn') == 'False': + if ('globalTotInsn' in self.simplerName) == 'False': graphOption = 1 if (graphOption == 1): @@ -1018,7 +1018,7 @@ def type3Variable(self, x, xAxis, y, yAxis, plotID): #if there are 
kernals.. we need to adjust the x axis for proper labelling #Need to make changes here.. works for now though - if self.simplerName.has_key('globalTotInsn'): + if 'globalTotInsn' in self.simplerName: x = self.updateVarKernal(x) concentrationFactor = len(x) // 512 + 1 @@ -1038,7 +1038,7 @@ def type3Variable(self, x, xAxis, y, yAxis, plotID): yoff = numpy.array([0.0] * numCols) #variable use to remember the last top location of a bar so that we may stack the proceeding bar on top of it #Legendname = ['UNUSED', 'UNUSED', 'FQPUSHED','ICNT_PUSHED','ICNT_INJECTED','ICNT_AT_DEST','DRAMQ','DRAM_PROCESSING_START','DRAM_PROCESSING_END','DRAM_OUTQ','2SH_ICNT_PUSHED','2SH_ICNT_INJECTED','2SH_ICNT_AT_DEST','2SH_FQ_POP','RETURN_Q']; Legendname = ['N/A', 'N/A','N/A','IcntInpBuf','N/A','Icnt2DRAM','N/A','N/A','N/A','DRAM','2Sh_IcntInpBuf','N/A','Icnt2shd','N/A','N/A']; - BarSequence = range(numRows-1,-1,-1) + BarSequence = list(range(numRows-1,-1,-1)) if yAxis == 'WarpDivergenceBreakdown': Legendname = [] @@ -1046,21 +1046,21 @@ def type3Variable(self, x, xAxis, y, yAxis, plotID): Legendname.append('Data Hazard') Legendname.append('Stall') for c in range(2, numRows): - Legendname.append('W' + `4*(c-2)+1` + ':' + `4*(c-1)`) - BarSequence = range(0,numRows) + Legendname.append('W' + repr(4*(c-2)+1) + ':' + repr(4*(c-1))) + BarSequence = list(range(0,numRows)) if yAxis == 'WarpIssueSlotBreakdown': Legendname = [] for c in range(0, numRows): - Legendname.append('W' + `c`) - BarSequence = range(0,numRows) + Legendname.append('W' + repr(c)) + BarSequence = list(range(0,numRows)) dynamic_warp_resolution = 32 if yAxis == 'WarpIssueDynamicIdBreakdown': Legendname = [] for c in range(0, numRows): - Legendname.append('W' + `dynamic_warp_resolution*c` + ":" + `dynamic_warp_resolution*(c+1)`) - BarSequence = range(0,numRows) + Legendname.append('W' + repr(dynamic_warp_resolution*c) + ":" + repr(dynamic_warp_resolution*(c+1))) + BarSequence = list(range(0,numRows)) yoff_max = numpy.array([0.0] * numCols) for row in range(numRows-1,-1,-1): @@ -1102,10 +1102,10 @@ def type3Variable(self, x, xAxis, y, yAxis, plotID): for label in self.plot.get_yticklabels(): label.set_fontsize(plotFormat.yticksFontSize) - self.canvas.show() + self.canvas.draw() def type4Variable(self, x, xAxis, y, yAxis, plotID): - keys = y.keys() + keys = list(y.keys()) keys.sort() if (self.dataPointer.graphChosen == self.possGraphs[3]): @@ -1251,7 +1251,7 @@ def plot2VarLine(self, x, xAxis, y, yAxis): self.plot.set_title(self.plotFormatInfo[self.currPlot].title) self.plot.set_xlabel(self.plotFormatInfo[self.currPlot].xlabel, fontsize = self.plotFormatInfo[self.currPlot].labelFontSize) self.plot.set_ylabel(self.plotFormatInfo[self.currPlot].ylabel, fontsize = self.plotFormatInfo[self.currPlot].labelFontSize) - self.canvas.show() + self.canvas.draw() def plotMultVarLine(self, x, xAxis, y, yAxis): @@ -1261,7 +1261,7 @@ def plotMultVarLine(self, x, xAxis, y, yAxis): self.plotFormatInfo[self.currPlot].InitLabels(xlabel = xAxis, ylabel = yAxis, cbarlabel = '', title = '') self.plot.set_xlabel(self.plotFormatInfo[self.currPlot].xlabel, fontsize = self.plotFormatInfo[self.currPlot].labelFontSize) self.plot.set_ylabel(self.plotFormatInfo[self.currPlot].ylabel, fontsize = self.plotFormatInfo[self.currPlot].labelFontSize) - self.canvas.show() + self.canvas.draw() def plotScatter(self, x, xAxis, y, yAxis, plotID): @@ -1275,7 +1275,7 @@ def plotScatter(self, x, xAxis, y, yAxis, plotID): self.plot.set_title(plotFormat.title, fontsize = plotFormat.labelFontSize) 
self.plot.set_xlabel(plotFormat.xlabel, fontsize = plotFormat.labelFontSize) self.plot.set_ylabel(plotFormat.ylabel, fontsize = plotFormat.labelFontSize) - self.canvas.show() + self.canvas.draw() def takeDerivativeMult(self,x,y): @@ -1347,12 +1347,12 @@ def plotParallelIntensity(self, x, xAxis, y, yAxis, colorAxis, yTicks, plotID): # put number on axis if there are more than one ticks if (self.xAxisStepsWilStack[self.currPlot] != 1): - for count in range(0,len(x),len(x)/self.xAxisStepsWilStack[self.currPlot]): + for count in range(0,len(x),int(len(x)/self.xAxisStepsWilStack[self.currPlot])): xlabelValues.append(x[count]) xlabelPos.append(xticksPos[count]) - print self.yAxisStepsWilStack[self.currPlot] - for count in range(0,len(y),len(y)/self.yAxisStepsWilStack[self.currPlot]): + print(self.yAxisStepsWilStack[self.currPlot]) + for count in range(0,len(y),int(len(y)/self.yAxisStepsWilStack[self.currPlot])): ylabelValues.append(yTicks[count]) ylabelPos.append(yticksPos[count]) @@ -1387,7 +1387,7 @@ def plotParallelIntensity(self, x, xAxis, y, yAxis, colorAxis, yTicks, plotID): xtickStep = x[1] - x[0] self.plot.set_xlim(0 / xtickStep - 0.5, self.xlim / xtickStep + 0.5) - self.canvas.show() + self.canvas.draw() def updateWilTicks(self, z): x= [] @@ -1480,7 +1480,7 @@ def changeColorMapMaxMin(self): else: for iter in range(0, self.dataPointer.dydx): if self.simplerName[self.dataPointer.dataChosenY].type == 4: - keys = self.simplerName[self.dataPointer.dataChosenY].data.keys() + keys = list(self.simplerName[self.dataPointer.dataChosenY].data.keys()) keys.sort() y = [] for iter in keys: @@ -1523,7 +1523,7 @@ def changeColorMapMaxMin(self): entry[self.currPlot] = (maxEntry, minEntry) cmap = self.plotFormatInfo[self.currPlot].cmap - plotCMap = apply(Tk.OptionMenu, (root[-1], cmap) + tuple(PlotFormatInfo.cmapOptions)) + plotCMap = Tk.OptionMenu(*(root[-1], cmap) + tuple(PlotFormatInfo.cmapOptions)) plotCMap.pack(side = Tk.LEFT, padx = 5) @@ -1612,7 +1612,7 @@ def collectDataChangeDiv(self, vars,master): for self.currPlot in range(1,numPlots + 1): self.findKernalLocs() - if vars.has_key(str(self.currPlot)): + if str(self.currPlot) in vars: if vars[str(self.currPlot)].get() == 1: self.dataPointer.dydx += 1 @@ -1681,12 +1681,12 @@ def collectDataIncreaseYBinning(self, currPlot): if (self.yAxisStepsWilStack[plotToIncrease] == 1): self.yAxisStepsWilStack[plotToIncrease] = 2 self.yAxisStepsWilStack[plotToIncrease] = int(float(self.yAxisStepsWilStack[plotToIncrease])*1.50) - print self.yAxisStepsWilStack[plotToIncrease] + print(self.yAxisStepsWilStack[plotToIncrease]) self.plotDataForNewBinning(plotToIncrease) def collectDataDecreaseYBinning(self, currPlot, remove = False): plotToDecrease = int(currPlot[0]) - print self.yAxisStepsWilStack[plotToDecrease] + print(self.yAxisStepsWilStack[plotToDecrease]) if (remove == True): self.yAxisStepsWilStack[plotToDecrease] = 1 else: @@ -1751,7 +1751,7 @@ def editLabelsButton(self): entries[self.currPlot].append(Tk.Entry(root, width = 50)) entries[self.currPlot][-1].grid(row = currentRow, column = 4, padx = 10) entries[self.currPlot][-1].insert(0, self.plot.get_xlabel()) - if self.colorbars.has_key(self.currPlot): + if self.currPlot in self.colorbars: plotLabel3 = Tk.Label(root, text = 'Colorbar: ', bg = 'white') plotLabel3.grid(row = currentRow, column = 5) entries[self.currPlot].append(Tk.Entry(root, width = 20)) @@ -1821,7 +1821,7 @@ def collectDataEditLabels(self, entries, master): self.plot.set_ylabel(plotFormat.ylabel, fontsize=plotFormat.labelFontSize) 
plotFormat.xlabel = entries[self.currPlot][1].get() self.plot.set_xlabel(plotFormat.xlabel, fontsize=plotFormat.labelFontSize) - if self.colorbars.has_key(self.currPlot): + if self.currPlot in self.colorbars: plotFormat.cbarlabel = entries[self.currPlot][2].get() self.colorbars[self.currPlot].set_label(plotFormat.cbarlabel, fontsize=plotFormat.labelFontSize) else: @@ -1841,7 +1841,7 @@ def collectDataEditLabels(self, entries, master): ytickslabels[n].set_fontsize(plotFormat.yticksFontSize) # change colorbar ticks label fontsize - if self.colorbars.has_key(self.currPlot): + if self.currPlot in self.colorbars: for label in self.colorbars[self.currPlot].ax.get_yticklabels(): label.set_fontsize(plotFormat.cticksFontSize) @@ -1851,7 +1851,7 @@ def collectDataEditLabels(self, entries, master): master.destroy() ## Now replot with changes..... - self.canvas.show() + self.canvas.draw() def zoomButton(self): #Variable initializations @@ -1980,7 +1980,7 @@ def zoomCollect(self, entries, master): plot.set_xticks(xlabelPos) master.destroy() - self.canvas.show() + self.canvas.draw() class NaviPlotInfo: @@ -2224,6 +2224,7 @@ def showData(self): countLines = 1 for lines in self.file.readlines(): + lines = lines.decode() self.textbox.insert(Tk.END, str(countLines) + '. ' + lines, ('normal')) countLines += 1 countLines -= 1 @@ -2232,7 +2233,7 @@ def showData(self): figure = Figure(figsize=(22,5), dpi = 70) self.histArea = FigureCanvasTkAgg(figure, master= bottomFrame) self.histArea.get_tk_widget().pack() - toolbar = NavigationToolbar2TkAgg(self.histArea, toolbarFrame) + toolbar = NavigationToolbar2Tk(self.histArea, toolbarFrame) toolbar.update() self.histogram = figure.add_subplot(111) cid = figure.canvas.mpl_connect('button_press_event',self.onclick) @@ -2285,8 +2286,8 @@ def showData(self): count += 1 def yview(self, *args): - apply(self.textbox.yview, args) - apply(self.statstextbox.yview, args) + self.textbox.yview(*args) + self.statstextbox.yview(*args) def onclick(self, event): if event.button == 3: @@ -2298,6 +2299,7 @@ def onclick(self, event): self.textbox.delete(0.0, Tk.END) self.file = open(self.fileChosen, 'r') for lines in self.file.readlines(): + lines=lines.decode() if (countLines < event.xdata - 1) or (countLines > event.xdata + 1): self.textbox.insert(Tk.END, str(countLines) + '. ' + lines, ('normal')) else: @@ -2317,8 +2319,8 @@ def onclick(self, event): - apply(self.textbox.yview, args) - apply(self.statstextbox.yview, args) + self.textbox.yview(*args) + self.statstextbox.yview(*args) def chooseFileCuda(self, *event): self.fileChosen = self.cAvailableCudaFiles.get('active') @@ -2572,3 +2574,4 @@ def decreaseBinning(self): + diff --git a/aerialvision/lexyacc.py b/aerialvision/lexyacc.py index d657383eb..53541ed44 100644 --- a/aerialvision/lexyacc.py +++ b/aerialvision/lexyacc.py @@ -82,7 +82,7 @@ def import_user_defined_variables(variables): try: file = open(os.path.join(userSettingPath, 'variables.txt'),'r') except: - print "No variables.txt file found." 
+ print("No variables.txt file found.") return #this can be replaced with a proper lex-yacc parser later @@ -96,7 +96,7 @@ def import_user_defined_variables(variables): continue # parse the line containing definition of a stat variable - s = line.split(",") + s = line.split(',') statName = s[0] statVar = vc.variable('', 1, 0) statVar.importFromString(line) @@ -104,8 +104,9 @@ def import_user_defined_variables(variables): # add parsed stat variable to the searchable map variables[statName] = statVar - except Exception, (e): - print "error:",e,", in variables.txt line:",line + except Exception as xxx_todo_changeme: + (e) = xxx_todo_changeme + print("error:",e,", in variables.txt line:",line) # Parses through a given log file for data def parseMe(filename): @@ -136,7 +137,7 @@ def t_newline(t): t.lexer.lineno += t.value.count("\n") def t_error(t): - print "Illegal character '%s'" % t.value[0] + print("Illegal character '%s'" % t.value[0]) t.lexer.skip(1) lex.lex() @@ -202,14 +203,14 @@ def t_error(t): # generate a lookup table based on the specified name in log file for each stat stat_lookuptable = {} - for name, var in variables.iteritems(): + for name, var in variables.items(): if (name == 'CFLOG'): continue; if (var.lookup_tag != ''): stat_lookuptable[var.lookup_tag] = var else: stat_lookuptable[name.lower()] = var - + inputData = 'NULL' # a table containing all the metrics that has received the missing data warning @@ -218,19 +219,19 @@ def t_error(t): def p_sentence(p): '''sentence : WORD NUMBERSEQUENCE''' #print p[0], p[1],p[2] - num = p[2].split(" ") + num = p[2].split(' ') # detect empty data entry for particular metric and print a warning if p[2] == '': if not p[1] in stat_missing_warned: - print "WARNING: Sample entry for metric '%s' has no data. Skipping..." % p[1] + print("WARNING: Sample entry for metric '%s' has no data. Skipping..." 
% p[1]) stat_missing_warned[p[1]] = True return lookup_input = p[1].lower() if (lookup_input in stat_lookuptable): if (lookup_input == "globalcyclecount") and (int(num[0]) % 10000 == 0): - print "Processing global cycle %s" % num[0] + print("Processing global cycle %s" % num[0]) stat = stat_lookuptable[lookup_input] if (stat.type == 1): @@ -294,7 +295,7 @@ def p_sentence(p): def p_error(p): if p: - print("Syntax error at '%s'" % p.value) + print(("Syntax error at '%s'" % p.value)) else: print("Syntax error at EOF") @@ -306,11 +307,12 @@ def p_error(p): else: file = open(filename, 'r') while file: - line = file.readline() + line = file.readline().decode() + if not line : break - nameNdata = line.split(":") + nameNdata = line.split(':') if (len(nameNdata) != 2): - print("Syntax error at '%s'" % line) + print(("Syntax error at '%s'" % line)) namePart = nameNdata[0].strip() dataPart= nameNdata[1].strip() parts = [' ', namePart, dataPart] @@ -323,3 +325,4 @@ def p_error(p): + diff --git a/aerialvision/lexyaccbookmark.py b/aerialvision/lexyaccbookmark.py index 42c6b406e..7aa2f800f 100644 --- a/aerialvision/lexyaccbookmark.py +++ b/aerialvision/lexyaccbookmark.py @@ -108,7 +108,7 @@ def t_NOTHING(t): def t_error(t): - print "Illegal character '%s'" % t.value[0] + print("Illegal character '%s'" % t.value[0]) t.lexer.skip(1) lex.lex() @@ -150,7 +150,7 @@ def p_sentence(p): pass else: - print 'An Parsing Error has occurred' + print('An Parsing Error has occurred') @@ -159,7 +159,7 @@ def p_sentence(p): def p_error(p): if p: - print("Syntax error at '%s'" % p.value) + print(("Syntax error at '%s'" % p.value)) else: print("Syntax error at EOF") @@ -168,7 +168,7 @@ def p_error(p): try: file = open(os.environ['HOME'] + '/.gpgpu_sim/aerialvision/bookmarks.txt', 'r') inputData = file.readlines() - except IOError,e: + except IOError as e: if e.errno == 2: inputData = '' else: @@ -178,3 +178,4 @@ def p_error(p): yacc.parse(x[0:-1]) # ,debug=True) return listBookmarks + diff --git a/aerialvision/lexyacctexteditor.py b/aerialvision/lexyacctexteditor.py index 51d3ced44..57b41db82 100644 --- a/aerialvision/lexyacctexteditor.py +++ b/aerialvision/lexyacctexteditor.py @@ -88,7 +88,7 @@ def t_newline(t): t.lexer.lineno += t.value.count("\n") def t_error(t): - print "Illegal character '%s'" % t.value[0] + print("Illegal character '%s'" % t.value[0]) t.lexer.skip(1) lex.lex() @@ -109,8 +109,8 @@ def p_sentence(p): def p_error(p): if p: - print("Syntax error at '%s'" % p.value) - print p + print(("Syntax error at '%s'" % p.value)) + print(p) else: print("Syntax error at EOF") @@ -152,17 +152,18 @@ def ptxToCudaMapping(filename): loc = int(m.group(2)) count += 1 - x = map.keys() + x = list(map.keys()) return map #Unit test / playground def main(): data = textEditorParseMe(sys.argv[1]) - print data[100] + print(data[100]) if __name__ == "__main__": main() + diff --git a/aerialvision/organizedata.py b/aerialvision/organizedata.py index 090b90f13..f5d5312c3 100644 --- a/aerialvision/organizedata.py +++ b/aerialvision/organizedata.py @@ -99,7 +99,7 @@ def organizedata(fileVars): } data_type_char = {int:'I', float:'f'} - print "Organizing data into internal format..." 
+ print("Organizing data into internal format...") # Organize globalCycle in advance because it is used as a reference if ('globalCycle' in fileVars): @@ -107,28 +107,28 @@ def organizedata(fileVars): fileVars['globalCycle'].data = organizeFunction[statData.organize](statData.data, data_type_char[statData.datatype]) # Organize other stat data into internal format - for statName, statData in fileVars.iteritems(): + for statName, statData in fileVars.items(): if (statName != 'CFLOG' and statName != 'globalCycle' and statData.organize != 'custom'): fileVars[statName].data = organizeFunction[statData.organize](statData.data, data_type_char[statData.datatype]) # Custom routines to organize stat data into internal format - if fileVars.has_key('averagemflatency'): + if 'averagemflatency' in fileVars: zeros = [] for count in range(len(fileVars['averagemflatency'].data),len(fileVars['globalCycle'].data)): zeros.append(0) fileVars['averagemflatency'].data = zeros + fileVars['averagemflatency'].data - if (skipCFLog == 0) and fileVars.has_key('CFLOG'): + if (skipCFLog == 0) and 'CFLOG' in fileVars: ptxFile = CFLOGptxFile statFile = CFLOGInsnInfoFile - print "PC Histogram to CUDA Src = %d" % convertCFLog2CUDAsrc + print("PC Histogram to CUDA Src = %d" % convertCFLog2CUDAsrc) parseCFLOGCUDA = convertCFLog2CUDAsrc if parseCFLOGCUDA == 1: - print "Obtaining PTX-to-CUDA Mapping from %s..." % ptxFile + print("Obtaining PTX-to-CUDA Mapping from %s..." % ptxFile) map = lexyacctexteditor.ptxToCudaMapping(ptxFile.rstrip()) - print "Obtaining Program Range from %s..." % statFile + print("Obtaining Program Range from %s..." % statFile) maxStats = max(lexyacctexteditor.textEditorParseMe(statFile.rstrip()).keys()) if parseCFLOGCUDA == 1: @@ -136,7 +136,7 @@ def organizedata(fileVars): for lines in map: for ptxLines in map[lines]: newMap[ptxLines] = lines - print " Total number of CUDA src lines = %s..." % len(newMap) + print(" Total number of CUDA src lines = %s..." % len(newMap)) markForDel = [] for ptxLines in newMap: @@ -144,7 +144,7 @@ def organizedata(fileVars): markForDel.append(ptxLines) for lines in markForDel: del newMap[lines] - print " Number of touched CUDA src lines = %s..." % len(newMap) + print(" Number of touched CUDA src lines = %s..." 
% len(newMap)) fileVars['CFLOGglobalPTX'] = vc.variable('',2,0) fileVars['CFLOGglobalCUDA'] = vc.variable('',2,0) @@ -152,7 +152,7 @@ def organizedata(fileVars): count = 0 for iter in fileVars['CFLOG']: - print "Organizing data for %s" % iter + print("Organizing data for %s" % iter) fileVars[iter + 'PTX'] = fileVars['CFLOG'][iter] fileVars[iter + 'PTX'].data = CFLOGOrganizePTX(fileVars['CFLOG'][iter].data, fileVars['CFLOG'][iter].maxPC) @@ -174,7 +174,7 @@ def organizedata(fileVars): for columns in range(0, len(fileVars[iter + 'CUDA'].data[rows])): fileVars['CFLOGglobalCUDA'].data[rows][columns] += fileVars[iter + 'CUDA'].data[rows][columns] except: - print "Error in generating globalCFLog data" + print("Error in generating globalCFLog data") count += 1 del fileVars['CFLOG'] @@ -231,10 +231,10 @@ def nullOrganizedStackedBar(nullVar, datatype_c): for row in range (0,len(organized)): newy = array.array(datatype_c, [0 for col in range(newLen)]) for col in range(0, len(organized[row])): - newcol = col / n_data + newcol = int(col / n_data) newy[newcol] += organized[row][col] for col in range(0, len(newy)): - newy[col] /= n_data + newy[col] = int(newy[col]/n_data) organized[row] = newy return organized @@ -320,15 +320,15 @@ def CFLOGOrganizeCuda(list, ptx2cudamap): nSamples = len(list[0]) # create a dictionary of empty data array (one array per cuda source line) - for ptxline, cudaline in ptx2cudamap.iteritems(): - if tmp.has_key(cudaline): + for ptxline, cudaline in ptx2cudamap.items(): + if cudaline in tmp: pass else: tmp[cudaline] = [0 for lengthData in range(nSamples)] for cudaline in tmp: - for ptxLines, mapped_cudaline in ptx2cudamap.iteritems(): + for ptxLines, mapped_cudaline in ptx2cudamap.items(): if mapped_cudaline == cudaline: for lengthData in range(nSamples): tmp[cudaline][lengthData] += list[ptxLines][lengthData] @@ -336,7 +336,7 @@ def CFLOGOrganizeCuda(list, ptx2cudamap): final = [] for iter in range(min(tmp.keys()),max(tmp.keys())): - if tmp.has_key(iter): + if iter in tmp: final.append(tmp[iter]) else: final.append([0 for lengthData in range(nSamples)]) @@ -356,3 +356,4 @@ def CFLOGOrganizeCuda(list, ptx2cudamap): # return organized + diff --git a/aerialvision/parser.out b/aerialvision/parser.out new file mode 100644 index 000000000..809874f58 --- /dev/null +++ b/aerialvision/parser.out @@ -0,0 +1,47 @@ +Created by PLY version 3.11 (http://www.dabeaz.com/ply) + +Grammar + +Rule 0 S' -> sentence +Rule 1 sentence -> WORD NUMBERSEQUENCE + +Terminals, with rules where they appear + +NUMBERSEQUENCE : 1 +WORD : 1 +error : + +Nonterminals, with rules where they appear + +sentence : 0 + +Parsing method: LALR + +state 0 + + (0) S' -> . sentence + (1) sentence -> . WORD NUMBERSEQUENCE + + WORD shift and go to state 2 + + sentence shift and go to state 1 + +state 1 + + (0) S' -> sentence . + + + +state 2 + + (1) sentence -> WORD . NUMBERSEQUENCE + + NUMBERSEQUENCE shift and go to state 3 + + +state 3 + + (1) sentence -> WORD NUMBERSEQUENCE . + + $end reduce using rule 1 (sentence -> WORD NUMBERSEQUENCE .) + diff --git a/aerialvision/parsetab.py b/aerialvision/parsetab.py new file mode 100644 index 000000000..47a38843c --- /dev/null +++ b/aerialvision/parsetab.py @@ -0,0 +1,31 @@ + +# parsetab.py +# This file is automatically generated. Do not edit. 
+# pylint: disable=W,C,R +_tabversion = '3.10' + +_lr_method = 'LALR' + +_lr_signature = 'NUMBERSEQUENCE WORDsentence : WORD NUMBERSEQUENCE' + +_lr_action_items = {'WORD':([0,],[2,]),'$end':([1,3,],[0,-1,]),'NUMBERSEQUENCE':([2,],[3,]),} + +_lr_action = {} +for _k, _v in _lr_action_items.items(): + for _x,_y in zip(_v[0],_v[1]): + if not _x in _lr_action: _lr_action[_x] = {} + _lr_action[_x][_k] = _y +del _lr_action_items + +_lr_goto_items = {'sentence':([0,],[1,]),} + +_lr_goto = {} +for _k, _v in _lr_goto_items.items(): + for _x, _y in zip(_v[0], _v[1]): + if not _x in _lr_goto: _lr_goto[_x] = {} + _lr_goto[_x][_k] = _y +del _lr_goto_items +_lr_productions = [ + ("S' -> sentence","S'",1,None,None,None), + ('sentence -> WORD NUMBERSEQUENCE','sentence',2,'p_sentence','lexyacc.py',220), +] diff --git a/aerialvision/startup.py b/aerialvision/startup.py index ae14fd394..d261c0c10 100644 --- a/aerialvision/startup.py +++ b/aerialvision/startup.py @@ -62,11 +62,11 @@ import sys -import Tkinter as Tk +import tkinter as Tk import Pmw import lexyacc import guiclasses -import tkFileDialog as Fd +import tkinter.filedialog as Fd import organizedata import os import os.path @@ -160,7 +160,7 @@ def fileInput(cl_files=None): tmprecentfile = tmprecentfile.split('/') for iter in range(1,len(tmprecentfile) - 1): recentfile = recentfile + '/' + tmprecentfile[iter] - except IOError,e: + except IOError as e: if e.errno == 2: # recentfiles.txt does not exist, ignore and use CWD recentfile = '.' @@ -313,7 +313,7 @@ def loadRecentFile(entry): try: loadfile = open(os.path.join(userSettingPath, 'recentfiles.txt'), 'r') recentfiles = loadfile.readlines() - except IOError,e: + except IOError as e: if e.errno == 2: recentfiles = '' else: @@ -323,7 +323,7 @@ def loadRecentFile(entry): recentFileWindow.pack(side = Tk.TOP) scrollbar = Tk.Scrollbar(recentFileWindow, orient = Tk.VERTICAL) cRecentFile = Tk.Listbox(recentFileWindow, width = 100, height = 15, yscrollcommand = scrollbar.set) - cRecentFile.bind("", lambda(event): recentFileInsert(entry, cRecentFile.get('active'), instance)) + cRecentFile.bind("", lambda event: recentFileInsert(entry, cRecentFile.get('active'), instance)) cRecentFile.pack(side = Tk.LEFT) scrollbar.config(command = cRecentFile.yview) scrollbar.pack(side = Tk.LEFT, fill = Tk.Y) @@ -391,9 +391,9 @@ def addListToListbox(listbox,list): Filenames.append(string) listbox.insert(Tk.END, string) else: - print 'Could not open file: ' + string + print('Could not open file: ' + string) except: - print 'Could not open file: ' + file + print('Could not open file: ' + file) def errorMsg(string): @@ -447,6 +447,7 @@ def submitClicked(instance, num, skipcflog, cflog2cuda, listboxes): startup(res, [TEFiles, TEPTXFiles, TEStatFiles]) def graphAddTab(vars, graphTabs,res, entry): + TabsForGraphs.append(guiclasses.formEntry(graphTabs, str(len(TabsForGraphs) + 1), vars, res, entry)) entry.delete(0, Tk.END) @@ -586,7 +587,7 @@ def startup(res, TEFILES): organizedata.setCFLOGInfoFiles(TEFILES) for files in Filenames: vars[files] = organizedata.organizedata(vars[files]) - + graphAddTab(vars, graphTabs, res, eAddTab) @@ -873,3 +874,4 @@ def manageFilesSubmit(window, listbox): + diff --git a/aerialvision/variableclasses.py b/aerialvision/variableclasses.py index 18850a1ce..30d8d2d17 100644 --- a/aerialvision/variableclasses.py +++ b/aerialvision/variableclasses.py @@ -102,8 +102,9 @@ def importFromString(self, string_spec): assert(self.organize == 'idx2DVec') elif (self.type == 5): assert(self.organize == 'sparse') - 
except Exception, (e):
-            print "Error in creating new stat variable from string: %s" % string_spec
+        except Exception as e:
+            print("Error in creating new stat variable from string: %s" % string_spec)
             raise e

     def initSparseMatrix(self):
@@ -133,7 +134,7 @@ def loadLineStatName(filename):
     global lineStatName
     file = open(filename, 'r')
     while file:
        line = file.readline()
        if not line : break
        if (line.startswith('kernel line :')) :
            line = line.strip()
@@ -171,7 +172,7 @@ def takeMax(self,key):
         except:
             tmp = 0
             if cudaLineNo.debug:
-                print 'Exception in cudaLineNo.takeMax()', self.stats[key]
+                print('Exception in cudaLineNo.takeMax()', self.stats[key])
         return tmp

     def takeRatioSums(self, key1,key2):
@@ -182,9 +183,9 @@
             return tmp1/tmp2
         except:
             if cudaLineNo.debug:
-                print tmp1, tmp2
+                print(tmp1, tmp2)
             if tmp2 == 0 and cudaLineNo.debug:
-                print 'infinite'
+                print('infinite')
             return 0
@@ -209,7 +210,7 @@ def returnRatio(self, key1, key2):
             return tmp1/tmp2
         except:
             if tmp2 == 0 and ptxLineNo.debug:
-                print 'infinite'
+                print('infinite')
             return 0
@@ -221,3 +222,4 @@
+
diff --git a/bin/aerialvision.py b/bin/aerialvision.py
index a5b02f0fe..5cc7ad983 100755
--- a/bin/aerialvision.py
+++ b/bin/aerialvision.py
@@ -66,20 +66,21 @@
 import os

 if not os.environ['HOME']:
-    print 'please set your HOME environment variable to your home directory'
+    print('please set your HOME environment variable to your home directory')
     sys.exit
 if not os.environ['GPGPUSIM_ROOT']:
-    print 'please set your GPGPUSIM_ROOT environment variable to your home directory'
+    print('please set your GPGPUSIM_ROOT environment variable to your home directory')
     sys.exit

 sys.path.append( os.environ['GPGPUSIM_ROOT'] + '/aerialvision/' )

-import Tkinter as Tk
+import tkinter as Tk
 import Pmw
 import startup
 import time
-from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg, NavigationToolbar2TkAgg
+from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg, NavigationToolbar2Tk
 from matplotlib.figure import Figure

 startup.fileInput(sys.argv[1:])
+
diff --git a/configs/tested-cfgs/SM2_GTX480/gpgpusim.config b/configs/tested-cfgs/SM2_GTX480/gpgpusim.config
index 609a9ef1b..bc01821db 100644
--- a/configs/tested-cfgs/SM2_GTX480/gpgpusim.config
+++ b/configs/tested-cfgs/SM2_GTX480/gpgpusim.config
@@ -56,7 +56,7 @@
 # In Fermi, the cache and shared memory can be configured to 16kb:48kb(default) or 48kb:16kb
-# <nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**
+# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**
 # ** Optional parameter - Required when mshr_type==Texture Fifo
 # Note: Hashing set index function (H) only applies to a set size of 32 or 64.
 -gpgpu_cache:dl1  N:32:128:4,L:L:m:N:H,S:64:8,8
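A concrete reading of the cache option format above: the first group encodes the geometry, so capacity is nsets * bsize * assoc. The following sketch is illustrative only and not part of the patch (cache_size_bytes is a hypothetical helper name):

    # Sketch only: decode the geometry group of a GPGPU-Sim cache config
    # string such as "N:32:128:4,L:L:m:N:H,S:64:8,8" and report its capacity.
    def cache_size_bytes(cfg):
        geometry = cfg.split(',')[0]          # e.g. "N:32:128:4"
        nsets, bsize, assoc = (int(x) for x in geometry.split(':')[-3:])
        return nsets * bsize * assoc

    # The GTX480 L1D above: 32 sets x 128-byte lines x 4-way = 16 KB per SM.
    assert cache_size_bytes("N:32:128:4,L:L:m:N:H,S:64:8,8") == 16384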
diff --git a/configs/tested-cfgs/SM3_KEPLER_TITAN/gpgpusim.config b/configs/tested-cfgs/SM3_KEPLER_TITAN/gpgpusim.config
index c83159f5f..ef47ddfd9 100644
--- a/configs/tested-cfgs/SM3_KEPLER_TITAN/gpgpusim.config
+++ b/configs/tested-cfgs/SM3_KEPLER_TITAN/gpgpusim.config
@@ -99,7 +99,7 @@
 # Greedy then oldest scheduler
 -gpgpu_scheduler gto
-# <nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**
+# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**
 # ** Optional parameter - Required when mshr_type==Texture Fifo
 # Note: Hashing set index function (H) only applies to a set size of 32 or 64.
 # The default is to disable the L1 cache, unless cache modifiers are used
diff --git a/configs/tested-cfgs/SM6_TITANX/accelwattch_ptx_sim.xml b/configs/tested-cfgs/SM6_TITANX/accelwattch_ptx_sim.xml
new file mode 100644
index 000000000..d94d5bdd4
--- /dev/null
+++ b/configs/tested-cfgs/SM6_TITANX/accelwattch_ptx_sim.xml
@@ -0,0 +1,623 @@
+[623 lines of AccelWattch power-model XML elided; the markup was stripped during extraction and is not recoverable]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM6_TITANX/accelwattch_ptx_sim_alt.xml b/configs/tested-cfgs/SM6_TITANX/accelwattch_ptx_sim_alt.xml
new file mode 100644
index 000000000..0c6f21147
--- /dev/null
+++ b/configs/tested-cfgs/SM6_TITANX/accelwattch_ptx_sim_alt.xml
@@ -0,0 +1,623 @@
+[623 lines of AccelWattch power-model XML elided; the markup was stripped during extraction and is not recoverable]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM6_TITANX/accelwattch_sass_sim.xml b/configs/tested-cfgs/SM6_TITANX/accelwattch_sass_sim.xml
new file mode 100644
index 000000000..570332d1c
--- /dev/null
+++ b/configs/tested-cfgs/SM6_TITANX/accelwattch_sass_sim.xml
@@ -0,0 +1,613 @@
+[613 lines of AccelWattch power-model XML elided; the markup was stripped during extraction and is not recoverable]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM6_TITANX/accelwattch_sass_sim_alt.xml b/configs/tested-cfgs/SM6_TITANX/accelwattch_sass_sim_alt.xml
new file mode 100644
index 000000000..9998e9656
--- /dev/null
+++ b/configs/tested-cfgs/SM6_TITANX/accelwattch_sass_sim_alt.xml
@@ -0,0 +1,613 @@
+[613 lines of AccelWattch power-model XML elided; the markup was stripped during extraction and is not recoverable]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM6_TITANX/gpgpusim.config b/configs/tested-cfgs/SM6_TITANX/gpgpusim.config
index 5b243a5b6..7d3e2d47e 100644
--- a/configs/tested-cfgs/SM6_TITANX/gpgpusim.config
+++ b/configs/tested-cfgs/SM6_TITANX/gpgpusim.config
@@ -1,3 +1,32 @@
+# Copyright (c) 2018-2021, Vijay Kandiah, Junrui Pan, Mahmoud Khairy, Scott Peverelle, Timothy Rogers, Tor M. Aamodt, Nikos Hardavellas
+# Northwestern University, Purdue University, The University of British Columbia
+# All rights reserved.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer;
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution;
+# 3. Neither the names of Northwestern University, Purdue University,
+# The University of British Columbia nor the names of their contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
 # This config models the Pascal GP102 (NVIDIA TITAN X)
 # For more info about this card, see Nvidia White paper
 # http://international.download.nvidia.com/geforce-com/international/pdfs/GeForce_GTX_1080_Whitepaper_FINAL.pdf
@@ -28,6 +57,7 @@
 -gpgpu_n_cores_per_cluster 1
 -gpgpu_n_mem 12
 -gpgpu_n_sub_partition_per_mchannel 2
+-gpgpu_clock_gated_lanes 1

 # Pascal clock domains
 #-gpgpu_clock_domains <Core Clock>:<Interconnect Clock>:<L2 Clock>:<DRAM Clock>
@@ -93,7 +123,7 @@
 -gpgpu_dual_issue_diff_exec_units 1

 ## L1/shared memory configuration
-# <nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**
+# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**
 # ** Optional parameter - Required when mshr_type==Texture Fifo
 # Note: Hashing set index function (H) only applies to a set size of 32 or 64.
 # The default is to disable the L1 cache, unless cache modifiers are used
@@ -170,11 +200,8 @@
 -enable_ptx_file_line_stats 1
 -visualizer_enabled 0

-# power model configs, disable it untill we create a real energy model for Pascal 102
--power_simulation_enabled 0

 # tracing functionality
 #-trace_enabled 1
 #-trace_components WARP_SCHEDULER,SCOREBOARD
 #-trace_sampling_core 0
-
diff --git a/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config b/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config
index 6fe04eecd..6ff4b6c08 100644
--- a/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config
+++ b/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config
@@ -1,8 +1,3 @@
-# This config models the Turing RTX 2060
-# For more info about turing architecture:
-# 1- https://www.nvidia.com/content/dam/en-zz/Solutions/design-visualization/technologies/turing-architecture/NVIDIA-Turing-Architecture-Whitepaper.pdf
-# 2- "RTX on—The NVIDIA Turing GPU", IEEE MICRO 2020
-
 # functional simulator specification
 -gpgpu_ptx_instruction_classification 0
 -gpgpu_ptx_sim_mode 0
@@ -14,6 +9,8 @@
 -gpgpu_runtime_sync_depth_limit 2
 -gpgpu_runtime_pending_launch_count_limit 2048
 -gpgpu_kernel_launch_latency 5000
+-gpgpu_TB_launch_latency 0
+-gpgpu_max_concurrent_kernel 128

 # Compute Capability
 -gpgpu_compute_capability_major 7
@@ -27,31 +24,27 @@
 -gpgpu_n_clusters 30
 -gpgpu_n_cores_per_cluster 1
 -gpgpu_n_mem 12
--gpgpu_n_sub_partition_per_mchannel 2 
+-gpgpu_n_sub_partition_per_mchannel 2

-# volta clock domains
+# clock domains
 #-gpgpu_clock_domains <Core Clock>:<Interconnect Clock>:<L2 Clock>:<DRAM Clock>
--gpgpu_clock_domains 1365.0:1365.0:1365.0:3500.0
-# boost mode
-# -gpgpu_clock_domains 1680.0:1680.0:1680.0:3500.0
+-gpgpu_clock_domains 1365:1365:1365:3500.5

 # shader core pipeline config
 -gpgpu_shader_registers 65536
 -gpgpu_registers_per_block 65536
 -gpgpu_occupancy_sm_number 75
-# This implies a maximum of 32 warps/SM
--gpgpu_shader_core_pipeline 1024:32
--gpgpu_shader_cta 32
+-gpgpu_shader_core_pipeline 1024:32
+-gpgpu_shader_cta 16
 -gpgpu_simd_model 1

 # Pipeline widths and number of FUs
 # ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE
-## Turing has 4 SP SIMD units, 4 INT units, 4 SFU units, 8 Tensor core units
-## We need to scale the number of pipeline registers to be equal to the number of SP units
--gpgpu_pipeline_widths 4,0,4,4,4,4,0,4,4,4,8,4,4
+-gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4
 -gpgpu_num_sp_units 4
 -gpgpu_num_sfu_units 4
+-gpgpu_num_dp_units 4
 -gpgpu_num_int_units 4
 -gpgpu_tensor_core_avail 1
 -gpgpu_num_tensor_core_units 4
@@ -59,32 +52,18 @@
 # Instruction latencies and initiation intervals
 # "ADD,MAX,MUL,MAD,DIV"
 # All Div operations are executed on SFU unit
--ptx_opcode_latency_int 4,13,4,5,145,32
--ptx_opcode_initiation_int 2,2,2,2,8,4
--ptx_opcode_latency_fp 4,13,4,5,39
+-ptx_opcode_latency_int 4,4,4,4,21
+-ptx_opcode_initiation_int 2,2,2,2,2
+-ptx_opcode_latency_fp 4,4,4,4,39
 -ptx_opcode_initiation_fp 2,2,2,2,4
--ptx_opcode_latency_dp 8,19,8,8,330
--ptx_opcode_initiation_dp 4,4,4,4,130
--ptx_opcode_latency_sfu 100
+-ptx_opcode_latency_dp 64,64,64,64,330
+-ptx_opcode_initiation_dp 64,64,64,64,130
+-ptx_opcode_latency_sfu 21
 -ptx_opcode_initiation_sfu 8
 -ptx_opcode_latency_tesnor 64
 -ptx_opcode_initiation_tensor 64

-# Turing has four schedulers per core
--gpgpu_num_sched_per_core 4
-# Greedy then oldest scheduler
--gpgpu_scheduler gto
-## In Turing, a warp scheduler can issue 1 inst per cycle
--gpgpu_max_insn_issue_per_warp 1
--gpgpu_dual_issue_diff_exec_units 1
-
-# shared memory bankconflict detection
--gpgpu_shmem_num_banks 32
--gpgpu_shmem_limited_broadcast 0
--gpgpu_shmem_warp_parts 1
--gpgpu_coalesce_arch 75
-
-# Trung has sub core model, in which each scheduler has its own register file and EUs
+# sub core model: each scheduler has its own register file and EUs
 # i.e. schedulers are isolated
 -gpgpu_sub_core_model 1
 # disable specialized operand collectors and use generic operand collectors instead
@@ -92,26 +71,46 @@
 -gpgpu_operand_collector_num_units_gen 8
 -gpgpu_operand_collector_num_in_ports_gen 8
 -gpgpu_operand_collector_num_out_ports_gen 8
-# turing has 8 banks dual-port, 4 schedulers, two banks per scheduler
-# we increase #banks to 16 to mitigate the effect of Regisrer File Cache (RFC) which we do not implement in the current version
--gpgpu_num_reg_banks 16
+# register banks
+-gpgpu_num_reg_banks 8
 -gpgpu_reg_file_port_throughput 2

-# <nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**
+# warp scheduling
+-gpgpu_num_sched_per_core 4
+-gpgpu_scheduler lrr
+# warp scheduler issue mode
+-gpgpu_max_insn_issue_per_warp 1
+-gpgpu_dual_issue_diff_exec_units 1
+
+## L1/shared memory configuration
+# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**
 # ** Optional parameter - Required when mshr_type==Texture Fifo
--gpgpu_adaptive_cache_config 0
+# In adaptive cache, we adaptively assign the remaining shared memory to L1 cache
+# For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x
+-gpgpu_adaptive_cache_config 1
+-gpgpu_shmem_option 32,64
+-gpgpu_unified_l1d_size 96
+# L1 cache configuration
 -gpgpu_l1_banks 4
--gpgpu_cache:dl1 S:1:128:512,L:L:s:N:L,A:256:8,16:0,32
--gpgpu_shmem_size 65536
--gpgpu_shmem_sizeDefault 65536
--gpgpu_shmem_per_block 65536
+-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:256:32,16:0,32
+-gpgpu_l1_latency 32
 -gpgpu_gmem_skip_L1D 0
--gpgpu_n_cluster_ejection_buffer_size 32
--gpgpu_l1_latency 20
--gpgpu_smem_latency 20
 -gpgpu_flush_l1_cache 1
+-gpgpu_n_cluster_ejection_buffer_size 32
+-gpgpu_l1_cache_write_ratio 25
+
+# shared memory configuration
+-gpgpu_shmem_size 65536
+-gpgpu_shmem_sizeDefault 65536
+-gpgpu_shmem_per_block 49152
+-gpgpu_smem_latency 30
+# shared memory bank-conflict detection
+-gpgpu_shmem_num_banks 32
+-gpgpu_shmem_limited_broadcast 0
+-gpgpu_shmem_warp_parts 1
+-gpgpu_coalesce_arch 75

-# 64 sets, each 128 bytes 16-way for each memory sub partition (128 KB per memory sub partition). This gives us 3MB L2 cache
+# L2 cache
 -gpgpu_cache:dl2 S:64:128:16,L:B:m:L:P,A:192:4,32:0,32
 -gpgpu_cache:dl2_texture_only 0
 -gpgpu_dram_partition_queues 64:64:64:64
@@ -122,34 +121,31 @@
 -gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4
 -gpgpu_inst_fetch_throughput 4
 # 128 KB Tex
-# Note, TEX is deprected in Volta, It is used for legacy apps only. Use L1D cache instead with .nc modifier or __ldg mehtod
+# Note: TEX is deprecated since Volta; it is used for legacy apps only. Use the L1D cache instead with the .nc modifier or the __ldg method
 -gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2
 # 64 KB Const
 -gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4
 -gpgpu_perfect_inst_const_cache 1

 # interconnection
 #-network_mode 1
 #-inter_config_file config_turing_islip.icnt
 # use built-in local xbar
 -network_mode 2
 -icnt_in_buffer_limit 512
 -icnt_out_buffer_limit 512
 -icnt_subnets 2
--icnt_arbiter_algo 1
 -icnt_flit_size 40
+-icnt_arbiter_algo 1

 # memory partition latency config
--gpgpu_l2_rop_latency 160
--dram_latency 100
+-gpgpu_l2_rop_latency 194
+-dram_latency 96

-# dram model config
+# dram sched config
 -gpgpu_dram_scheduler 1
 -gpgpu_frfcfs_dram_sched_queue_size 64
 -gpgpu_dram_return_queue_size 192

-# Turing has GDDR6
-# http://monitorinsider.com/GDDR6.html
+# dram model config
 -gpgpu_n_mem_per_ctrlr 1
 -gpgpu_dram_buswidth 2
 -gpgpu_dram_burst_length 16
@@ -157,9 +153,9 @@
 -gpgpu_mem_address_mask 1
 -gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCC.BCCSSSSS

-# Use the same GDDR5 timing, scaled to 3500MHZ
--gpgpu_dram_timing_opt "nbk=16:CCD=4:RRD=10:RCD=20:RAS=50:RP=20:RC=62:
-                        CL=20:WL=8:CDLR=9:WR=20:nbkgrp=4:CCDL=4:RTPL=4"
+# Mem timing
+-gpgpu_dram_timing_opt nbk=16:CCD=4:RRD=12:RCD=24:RAS=55:RP=24:RC=78:CL=24:WL=8:CDLR=10:WR=24:nbkgrp=4:CCDL=6:RTPL=4
+-dram_dual_bus_interface 0

 # select lower bits for bnkgrp to increase bnkgrp parallelism
 -dram_bnk_indexing_policy 0
@@ -174,11 +170,10 @@
 -enable_ptx_file_line_stats 1
 -visualizer_enabled 0

-# power model configs, disable it untill we create a real energy model for Volta
+# power model configs, disable it until we create a real energy model
 -power_simulation_enabled 0

 # tracing functionality
 #-trace_enabled 1
 #-trace_components WARP_SCHEDULER,SCOREBOARD
-#-trace_sampling_core 0
-
+#-trace_sampling_core 0
\ No newline at end of file
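The adaptive cache options introduced above are worth unpacking: with -gpgpu_adaptive_cache_config 1, a shared-memory carveout is chosen from -gpgpu_shmem_option, and the remainder of the -gpgpu_unified_l1d_size budget goes to the L1 data cache. A minimal reader-side sketch of that policy, not part of the patch (the function name is hypothetical; the real selection logic lives inside the simulator):

    # Sketch only (not part of this patch): the adaptive L1/shared split that
    # -gpgpu_adaptive_cache_config enables, as described by the comment above.
    def split_unified_carveout(shmem_demand_kb, shmem_options_kb=(32, 64), unified_kb=96):
        # Pick the smallest shared-memory carveout that fits the kernel's demand;
        # the rest of the unified SRAM budget goes to the L1 data cache.
        for option_kb in sorted(shmem_options_kb):
            if shmem_demand_kb <= option_kb:
                return option_kb, unified_kb - option_kb
        raise ValueError("demand exceeds the largest shared-memory option")

    # A kernel using 20 KB of shared memory gets 32 KB shared and 64 KB of L1D.
    assert split_unified_carveout(20) == (32, 64)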
diff --git a/configs/tested-cfgs/SM75_RTX2060_S/accelwattch_ptx_sim.xml b/configs/tested-cfgs/SM75_RTX2060_S/accelwattch_ptx_sim.xml
new file mode 100644
index 000000000..d94d5bdd4
--- /dev/null
+++ b/configs/tested-cfgs/SM75_RTX2060_S/accelwattch_ptx_sim.xml
@@ -0,0 +1,623 @@
+[623 lines of AccelWattch power-model XML elided; the markup was stripped during extraction and is not recoverable]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM75_RTX2060_S/accelwattch_ptx_sim_alt.xml b/configs/tested-cfgs/SM75_RTX2060_S/accelwattch_ptx_sim_alt.xml
new file mode 100644
index 000000000..0c6f21147
--- /dev/null
+++ b/configs/tested-cfgs/SM75_RTX2060_S/accelwattch_ptx_sim_alt.xml
@@ -0,0 +1,623 @@
+[623 lines of AccelWattch power-model XML elided; the markup was stripped during extraction and is not recoverable]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM75_RTX2060_S/accelwattch_sass_sim.xml b/configs/tested-cfgs/SM75_RTX2060_S/accelwattch_sass_sim.xml
new file mode 100644
index 000000000..570332d1c
--- /dev/null
+++ b/configs/tested-cfgs/SM75_RTX2060_S/accelwattch_sass_sim.xml
@@ -0,0 +1,613 @@
+[613 lines of AccelWattch power-model XML elided; the markup was stripped during extraction and is not recoverable]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM75_RTX2060_S/accelwattch_sass_sim_alt.xml b/configs/tested-cfgs/SM75_RTX2060_S/accelwattch_sass_sim_alt.xml
new file mode 100644
index 000000000..9998e9656
--- /dev/null
+++ b/configs/tested-cfgs/SM75_RTX2060_S/accelwattch_sass_sim_alt.xml
@@ -0,0 +1,613 @@
+[613 lines of AccelWattch power-model XML elided; the markup was stripped during extraction and is not recoverable]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM75_RTX2060_S/config_turing_islip.icnt b/configs/tested-cfgs/SM75_RTX2060_S/config_turing_islip.icnt
new file mode 100644
index 000000000..eed1c34b6
--- /dev/null
+++ b/configs/tested-cfgs/SM75_RTX2060_S/config_turing_islip.icnt
@@ -0,0 +1,73 @@
+//52*1 fly with 32 flits per packet under gpgpusim injection mode
+use_map = 0;
+flit_size = 40;
+
+// currently we do not use this, see subnets below
+network_count = 2;
+
+// Topology
+topology = fly;
+k = 52;
+n = 1;
+
+// Routing
+
+routing_function = dest_tag;
+
+// Flow control
+
+num_vcs = 1;
+vc_buf_size = 64;
+input_buffer_size = 256;
+ejection_buffer_size = 64;
+boundary_buffer_size = 64;
+
+wait_for_tail_credit = 0;
+
+// Router architecture
+
+vc_allocator = islip; //separable_input_first;
+sw_allocator = islip; //separable_input_first;
+alloc_iters = 1;
+
+credit_delay = 0;
+routing_delay = 0;
+vc_alloc_delay = 1;
+sw_alloc_delay = 1;
+
+input_speedup = 1;
+output_speedup = 1;
+internal_speedup = 2.0;
+
+// Traffic, GPGPU-Sim does not use this
+
+traffic = uniform;
+packet_size ={{1,2,3,4},{10,20}};
+packet_size_rate={{1,1,1,1},{2,1}};
+
+// Simulation - Don't change
+
+sim_type = gpgpusim;
+//sim_type = latency;
+injection_rate = 0.1;
+
+subnets = 2;
+
+// Always use read and write no matter following line
+//use_read_write = 1;
+
+
+read_request_subnet = 0;
+read_reply_subnet = 1;
+write_request_subnet = 0;
+write_reply_subnet = 1;
+
+read_request_begin_vc = 0;
+read_request_end_vc = 0;
+write_request_begin_vc = 0;
+write_request_end_vc = 0;
+read_reply_begin_vc = 0;
+read_reply_end_vc = 0;
+write_reply_begin_vc = 0;
+write_reply_end_vc = 0;
+
diff --git a/configs/tested-cfgs/SM75_RTX2060_S/gpgpusim.config b/configs/tested-cfgs/SM75_RTX2060_S/gpgpusim.config
new file mode 100644
index 000000000..08ac75277
--- /dev/null
+++ b/configs/tested-cfgs/SM75_RTX2060_S/gpgpusim.config
@@ -0,0 +1,211 @@
+# Copyright (c) 2018-2021, Vijay Kandiah, Junrui Pan, Mahmoud Khairy, Scott Peverelle, Timothy Rogers, Tor M. Aamodt, Nikos Hardavellas
+# Northwestern University, Purdue University, The University of British Columbia
+# All rights reserved.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer;
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution;
+# 3. Neither the names of Northwestern University, Purdue University,
+# The University of British Columbia nor the names of their contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
+# This config models the Turing RTX 2060 Super
+# For more info about turing architecture:
+# 1- https://www.nvidia.com/content/dam/en-zz/Solutions/design-visualization/technologies/turing-architecture/NVIDIA-Turing-Architecture-Whitepaper.pdf
+# 2- "RTX on—The NVIDIA Turing GPU", IEEE MICRO 2020
+
+# functional simulator specification
+-gpgpu_ptx_instruction_classification 0
+-gpgpu_ptx_sim_mode 0
+-gpgpu_ptx_force_max_capability 75
+
+# Device Limits
+-gpgpu_stack_size_limit 1024
+-gpgpu_heap_size_limit 8388608
+-gpgpu_runtime_sync_depth_limit 2
+-gpgpu_runtime_pending_launch_count_limit 2048
+-gpgpu_kernel_launch_latency 5000
+-gpgpu_max_concurrent_kernel 128
+
+# Compute Capability
+-gpgpu_compute_capability_major 7
+-gpgpu_compute_capability_minor 5
+
+# PTX execution-driven
+-gpgpu_ptx_convert_to_ptxplus 0
+-gpgpu_ptx_save_converted_ptxplus 0
+
+# high level architecture configuration
+-gpgpu_n_clusters 34
+-gpgpu_n_cores_per_cluster 1
+-gpgpu_n_mem 16
+-gpgpu_n_sub_partition_per_mchannel 2
+-gpgpu_clock_gated_lanes 1
+
+# Turing clock domains
+#-gpgpu_clock_domains <Core Clock>:<Interconnect Clock>:<L2 Clock>:<DRAM Clock>
+-gpgpu_clock_domains 1905.0:1905.0:1905.0:3500.0
+# boost mode
+# -gpgpu_clock_domains 1680.0:1680.0:1680.0:3500.0
+
+# shader core pipeline config
+-gpgpu_shader_registers 65536
+-gpgpu_registers_per_block 65536
+-gpgpu_occupancy_sm_number 75
+
+# This implies a maximum of 32 warps/SM
+-gpgpu_shader_core_pipeline 1024:32
+-gpgpu_shader_cta 32
+-gpgpu_simd_model 1
+
+# Pipeline widths and number of FUs
+# ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE
+## Turing has 4 SP SIMD units, 4 INT units, 4 SFU units, 8 Tensor core units
+## We need to scale the number of pipeline registers to be equal to the number of SP units
+-gpgpu_pipeline_widths 4,0,4,4,4,4,0,4,4,4,8,4,4
+-gpgpu_num_sp_units 4
+-gpgpu_num_sfu_units 4
+-gpgpu_num_int_units 4
+-gpgpu_tensor_core_avail 1
+-gpgpu_num_tensor_core_units 4
+
+# Instruction latencies and initiation intervals
+# "ADD,MAX,MUL,MAD,DIV"
+# All Div operations are executed on SFU unit
+-ptx_opcode_latency_int 4,13,4,5,145,32
+-ptx_opcode_initiation_int 2,2,2,2,8,4
+-ptx_opcode_latency_fp 4,13,4,5,39
+-ptx_opcode_initiation_fp 2,2,2,2,4
+-ptx_opcode_latency_dp 8,19,8,8,330
+-ptx_opcode_initiation_dp 4,4,4,4,130
+-ptx_opcode_latency_sfu 100
+-ptx_opcode_initiation_sfu 8
+-ptx_opcode_latency_tesnor 64
+-ptx_opcode_initiation_tensor 64
+
+# Turing has four schedulers per core
+-gpgpu_num_sched_per_core 4
+# Loose round robin scheduler
+-gpgpu_scheduler lrr
+## In Turing, a warp scheduler can issue 1 inst per cycle
+-gpgpu_max_insn_issue_per_warp 1
+-gpgpu_dual_issue_diff_exec_units 1
+
+# shared memory bank-conflict detection
+-gpgpu_shmem_num_banks 32
+-gpgpu_shmem_limited_broadcast 0
+-gpgpu_shmem_warp_parts 1
+-gpgpu_coalesce_arch 75
+
+# Turing has sub core model, in which each scheduler has its own register file and EUs
+# i.e. schedulers are isolated
+-gpgpu_sub_core_model 1
+# disable specialized operand collectors and use generic operand collectors instead
+-gpgpu_enable_specialized_operand_collector 0
+-gpgpu_operand_collector_num_units_gen 8
+-gpgpu_operand_collector_num_in_ports_gen 8
+-gpgpu_operand_collector_num_out_ports_gen 8
+# Turing has 8 banks dual-port, 4 schedulers, two banks per scheduler
+# we increase #banks to 16 to mitigate the effect of the Register File Cache (RFC), which we do not implement in the current version
+-gpgpu_num_reg_banks 16
+-gpgpu_reg_file_port_throughput 2
+
+# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**
+# ** Optional parameter - Required when mshr_type==Texture Fifo
+-gpgpu_adaptive_cache_config 0
+-gpgpu_l1_banks 4
+-gpgpu_cache:dl1 S:1:128:512,L:L:s:N:L,A:256:8,16:0,32
+-gpgpu_shmem_size 65536
+-gpgpu_shmem_sizeDefault 65536
+-gpgpu_shmem_per_block 65536
+-gpgpu_gmem_skip_L1D 0
+-gpgpu_n_cluster_ejection_buffer_size 32
+-gpgpu_l1_latency 20
+-gpgpu_smem_latency 20
+-gpgpu_flush_l1_cache 1
+
+# 64 sets, each 128 bytes 16-way for each memory sub partition (128 KB per memory sub partition). This gives us 4MB L2 cache
+-gpgpu_cache:dl2 S:64:128:16,L:B:m:L:P,A:192:4,32:0,32
+-gpgpu_cache:dl2_texture_only 0
+-gpgpu_dram_partition_queues 64:64:64:64
+-gpgpu_perf_sim_memcpy 1
+-gpgpu_memory_partition_indexing 0
+
+# 128 KB Inst.
+-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4
+-gpgpu_inst_fetch_throughput 4
+# 128 KB Tex
+# Note: TEX is deprecated in Volta; it is used for legacy apps only. Use the L1D cache instead with the .nc modifier or the __ldg method
+-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2
+# 64 KB Const
+-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4
+-gpgpu_perfect_inst_const_cache 1
+
+# interconnection
+#-network_mode 1
+#-inter_config_file config_turing_islip.icnt
+# use built-in local xbar
+-network_mode 2
+-icnt_in_buffer_limit 512
+-icnt_out_buffer_limit 512
+-icnt_subnets 2
+-icnt_arbiter_algo 1
+-icnt_flit_size 40
+
+# memory partition latency config
+-gpgpu_l2_rop_latency 160
+-dram_latency 100
+
+# dram model config
+-gpgpu_dram_scheduler 1
+-gpgpu_frfcfs_dram_sched_queue_size 64
+-gpgpu_dram_return_queue_size 192
+
+# Turing has GDDR6
+# http://monitorinsider.com/GDDR6.html
+-gpgpu_n_mem_per_ctrlr 1
+-gpgpu_dram_buswidth 2
+-gpgpu_dram_burst_length 16
+-dram_data_command_freq_ratio 4
+-gpgpu_mem_address_mask 1
+-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCC.BCCSSSSS
+
+# Use the same GDDR5 timing, scaled to 3500 MHz
+-gpgpu_dram_timing_opt "nbk=16:CCD=4:RRD=10:RCD=20:RAS=50:RP=20:RC=62:
+                        CL=20:WL=8:CDLR=9:WR=20:nbkgrp=4:CCDL=4:RTPL=4"
+
+# select lower bits for bnkgrp to increase bnkgrp parallelism
+-dram_bnk_indexing_policy 0
+-dram_bnkgrp_indexing_policy 1
+
+#-dram_seperate_write_queue_enable 1
+#-dram_write_queue_size 64:56:32
+
+# stat collection
+-gpgpu_memlatency_stat 14
+-gpgpu_runtime_stat 500
+-enable_ptx_file_line_stats 1
+-visualizer_enabled 0
+
+# tracing functionality
+#-trace_enabled 1
+#-trace_components WARP_SCHEDULER,SCOREBOARD
+#-trace_sampling_core 0
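For readers decoding -gpgpu_dram_timing_opt values like the GDDR6 string above: each field is a JEDEC-style timing parameter expressed in memory-clock cycles. A small parsing sketch, illustrative only and not part of the patch (parse_dram_timing is a hypothetical helper):

    # Sketch only (not part of this patch): parse a -gpgpu_dram_timing_opt
    # string into a dict; values are timing parameters in memory-clock cycles.
    def parse_dram_timing(opt):
        fields = (f for f in opt.replace("\n", ":").split(":") if f.strip())
        return {k.strip(): int(v) for k, v in (f.split("=") for f in fields)}

    timing = parse_dram_timing(
        "nbk=16:CCD=4:RRD=10:RCD=20:RAS=50:RP=20:RC=62:"
        "CL=20:WL=8:CDLR=9:WR=20:nbkgrp=4:CCDL=4:RTPL=4")
    assert timing["RAS"] == 50 and timing["nbkgrp"] == 4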
diff --git a/configs/tested-cfgs/SM7_GV100/accelwattch_ptx_sim.xml b/configs/tested-cfgs/SM7_GV100/accelwattch_ptx_sim.xml
new file mode 100644
index 000000000..d94d5bdd4
--- /dev/null
+++ b/configs/tested-cfgs/SM7_GV100/accelwattch_ptx_sim.xml
@@ -0,0 +1,623 @@
+[623 lines of AccelWattch power-model XML elided; the markup was stripped during extraction and is not recoverable]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_GV100/accelwattch_ptx_sim_alt.xml b/configs/tested-cfgs/SM7_GV100/accelwattch_ptx_sim_alt.xml
new file mode 100644
index 000000000..0c6f21147
--- /dev/null
+++ b/configs/tested-cfgs/SM7_GV100/accelwattch_ptx_sim_alt.xml
@@ -0,0 +1,623 @@
+[623 lines of AccelWattch power-model XML elided; the markup was stripped during extraction and is not recoverable]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_GV100/accelwattch_sass_hw.xml b/configs/tested-cfgs/SM7_GV100/accelwattch_sass_hw.xml
new file mode 100644
index 000000000..64f89d646
--- /dev/null
+++ b/configs/tested-cfgs/SM7_GV100/accelwattch_sass_hw.xml
@@ -0,0 +1,613 @@
+[613 lines of AccelWattch power-model XML elided; the markup was stripped during extraction and is not recoverable]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_GV100/accelwattch_sass_hybrid.xml b/configs/tested-cfgs/SM7_GV100/accelwattch_sass_hybrid.xml
new file mode 100644
index 000000000..175f1fd47
--- /dev/null
+++ b/configs/tested-cfgs/SM7_GV100/accelwattch_sass_hybrid.xml
@@ -0,0 +1,613 @@
+[613 lines of AccelWattch power-model XML elided; the markup was stripped during extraction and is not recoverable]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_GV100/accelwattch_sass_sim.xml b/configs/tested-cfgs/SM7_GV100/accelwattch_sass_sim.xml
new file mode 100644
index 000000000..570332d1c
--- /dev/null
+++ b/configs/tested-cfgs/SM7_GV100/accelwattch_sass_sim.xml
@@ -0,0 +1,613 @@
+[613 lines of AccelWattch power-model XML elided; the markup was stripped during extraction and is not recoverable]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_GV100/accelwattch_sass_sim_alt.xml b/configs/tested-cfgs/SM7_GV100/accelwattch_sass_sim_alt.xml
new file mode 100644
index 000000000..9998e9656
--- /dev/null
+++ b/configs/tested-cfgs/SM7_GV100/accelwattch_sass_sim_alt.xml
@@ -0,0 +1,613 @@
+[613 lines of AccelWattch power-model XML elided; the markup was stripped during extraction and is not recoverable]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_GV100/config_volta_islip.icnt b/configs/tested-cfgs/SM7_GV100/config_volta_islip.icnt
new file mode 100644
index 000000000..5ad7ecd48
--- /dev/null
+++ b/configs/tested-cfgs/SM7_GV100/config_volta_islip.icnt
@@ -0,0 +1,74 @@
+//144*1 fly with 32 flits per packet under gpgpusim injection mode
+use_map = 0;
+flit_size = 40;
+
+// currently we do not use this, see subnets below
+network_count = 2;
+
+// Topology
+topology = fly;
+k = 144;
+n = 1;
+
+// Routing
+
+routing_function = dest_tag;
+
+// Flow control
+
+num_vcs = 1;
+vc_buf_size = 256;
+input_buffer_size = 256;
+ejection_buffer_size = 256;
+boundary_buffer_size = 256;
+
+wait_for_tail_credit = 0;
+
+// Router architecture
+
+vc_allocator = islip; //separable_input_first;
+sw_allocator = islip; //separable_input_first;
+alloc_iters = 1;
+
+credit_delay = 0;
+routing_delay = 0;
+vc_alloc_delay = 1;
+sw_alloc_delay = 1;
+
+input_speedup = 1;
+output_speedup = 1;
+internal_speedup = 2.0;
+
+// Traffic, GPGPU-Sim does not use this
+
+traffic = uniform;
+packet_size ={{1,2,3,4},{10,20}};
+packet_size_rate={{1,1,1,1},{2,1}};
+
+// Simulation - Don't change
+
+sim_type = gpgpusim;
+//sim_type = latency;
+injection_rate = 0.1;
+
+subnets = 2;
+
+// Always use read and write no matter following line
+//use_read_write = 1;
+
+
+read_request_subnet = 0;
+read_reply_subnet = 1;
+write_request_subnet = 0;
+write_reply_subnet = 1;
+
+read_request_begin_vc = 0;
+read_request_end_vc = 0;
+write_request_begin_vc = 0;
+write_request_end_vc = 0;
+read_reply_begin_vc = 0;
+read_reply_end_vc = 0;
+write_reply_begin_vc = 0;
+write_reply_end_vc = 0;
+
diff --git a/configs/tested-cfgs/SM7_GV100/gpgpusim.config b/configs/tested-cfgs/SM7_GV100/gpgpusim.config
new file mode 100644
index 000000000..26ce0eb58
--- /dev/null
+++ b/configs/tested-cfgs/SM7_GV100/gpgpusim.config
@@ -0,0 +1,237 @@
+# Copyright (c) 2018-2021, Vijay Kandiah, Junrui Pan, Mahmoud Khairy, Scott Peverelle, Timothy Rogers, Tor M. Aamodt, Nikos Hardavellas
+# Northwestern University, Purdue University, The University of British Columbia
+# All rights reserved.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer;
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution;
+# 3. Neither the names of Northwestern University, Purdue University,
+# The University of British Columbia nor the names of their contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
+
+# This config models the Volta Quadro V100
+# For more info about volta architecture:
+# http://images.nvidia.com/content/volta-architecture/pdf/volta-architecture-whitepaper.pdf
+# https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8344474&tag=1#
+# http://on-demand.gputechconf.com/gtc/2018/presentation/s8122-dissecting-the-volta-gpu-architecture-through-microbenchmarking.pdf
+# https://en.wikipedia.org/wiki/Volta_(microarchitecture)
+# https://www.hotchips.org/wp-content/uploads/hc_archives/hc29/HC29.21-Monday-Pub/HC29.21.10-GPU-Gaming-Pub/HC29.21.132-Volta-Choquette-NVIDIA-Final3.pdf
+# https://devblogs.nvidia.com/inside-volta/
+# http://on-demand.gputechconf.com/gtc/2017/presentation/s7798-luke-durant-inside-volta.pdf
+
+# functional simulator specification
+-gpgpu_ptx_instruction_classification 0
+-gpgpu_ptx_sim_mode 0
+-gpgpu_ptx_force_max_capability 70
+
+# Device Limits
+-gpgpu_stack_size_limit 1024
+-gpgpu_heap_size_limit 8388608
+-gpgpu_runtime_sync_depth_limit 2
+-gpgpu_runtime_pending_launch_count_limit 2048
+-gpgpu_kernel_launch_latency 5000
+-gpgpu_TB_launch_latency 0
+-gpgpu_max_concurrent_kernel 128
+
+# Compute Capability
+-gpgpu_compute_capability_major 7
+-gpgpu_compute_capability_minor 0
+
+# PTX execution-driven
+-gpgpu_ptx_convert_to_ptxplus 0
+-gpgpu_ptx_save_converted_ptxplus 0
+
+# high level architecture configuration
+-gpgpu_n_clusters 80
+-gpgpu_n_cores_per_cluster 1
+-gpgpu_n_mem 32
+-gpgpu_n_sub_partition_per_mchannel 2
+-gpgpu_clock_gated_lanes 1
+
+# Volta clock domains
+#-gpgpu_clock_domains <Core Clock>:<Interconnect Clock>:<L2 Clock>:<DRAM Clock>
+-gpgpu_clock_domains 1447.0:1447.0:1447.0:850.0
+# boost mode
+# -gpgpu_clock_domains 1628.0:1628.0:1628.0:850.0
+
+# shader core pipeline config
+-gpgpu_shader_registers 65536
+-gpgpu_registers_per_block 65536
+-gpgpu_occupancy_sm_number 70
+
+# This implies a maximum of 64 warps/SM
+-gpgpu_shader_core_pipeline 2048:32
+-gpgpu_shader_cta 32
+-gpgpu_simd_model 1
+
+# Pipeline widths and number of FUs
+# ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE
+## Volta GV100 has 4 SP SIMD units, 4 SFU units, 4 DP units per core, 4 Tensor core units
+## we need to scale the number of pipeline registers to be equal to the number of SP units
+-gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4
+-gpgpu_num_sp_units 4
+-gpgpu_num_sfu_units 4
+-gpgpu_num_dp_units 4
+-gpgpu_num_int_units 4
+-gpgpu_tensor_core_avail 1
+-gpgpu_num_tensor_core_units 4
+
+# Instruction latencies and initiation intervals
+# "ADD,MAX,MUL,MAD,DIV"
+# All Div operations are executed on SFU unit
+-ptx_opcode_latency_int 4,13,4,5,145,21
+-ptx_opcode_initiation_int 2,2,2,2,8,4
+-ptx_opcode_latency_fp 4,13,4,5,39
+-ptx_opcode_initiation_fp 2,2,2,2,4
+-ptx_opcode_latency_dp 8,19,8,8,330
+-ptx_opcode_initiation_dp 4,4,4,4,130
+-ptx_opcode_latency_sfu 100
+-ptx_opcode_initiation_sfu 8
+-ptx_opcode_latency_tesnor 64
+-ptx_opcode_initiation_tensor 64
+
+# Volta has sub core model, in which each scheduler has its own register file and EUs
+# i.e. schedulers are isolated
+-gpgpu_sub_core_model 1
+# disable specialized operand collectors and use generic operand collectors instead
+-gpgpu_enable_specialized_operand_collector 0
+-gpgpu_operand_collector_num_units_gen 8
+-gpgpu_operand_collector_num_in_ports_gen 8
+-gpgpu_operand_collector_num_out_ports_gen 8
+# volta has 8 banks, 4 schedulers, two banks per scheduler
+# we increase #banks to 16 to mitigate the effect of the Register File Cache (RFC), which we do not implement in the current version
+-gpgpu_num_reg_banks 16
+-gpgpu_reg_file_port_throughput 2
+
+# shared memory bank-conflict detection
+-gpgpu_shmem_num_banks 32
+-gpgpu_shmem_limited_broadcast 0
+-gpgpu_shmem_warp_parts 1
+-gpgpu_coalesce_arch 70
+
+# Volta has four schedulers per core
+-gpgpu_num_sched_per_core 4
+# Loose round robin scheduler
+-gpgpu_scheduler lrr
+## In Volta, a warp scheduler can issue 1 inst per cycle
+-gpgpu_max_insn_issue_per_warp 1
+-gpgpu_dual_issue_diff_exec_units 1
+
+## L1/shared memory configuration
+# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**
+# ** Optional parameter - Required when mshr_type==Texture Fifo
+# Default config is 32KB DL1 and 96KB shared memory
+# In Volta, we assign the remaining shared memory to L1 cache
+# if the assigned shd mem = 0, then L1 cache = 128KB
+# For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x
+# disable this mode in case of multi kernels/apps execution
+-gpgpu_adaptive_cache_config 1
+-gpgpu_shmem_option 0,8,16,32,64,96
+-gpgpu_unified_l1d_size 128
+# L1 cache configuration
+-gpgpu_l1_banks 4
+-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32
+-gpgpu_l1_cache_write_ratio 25
+-gpgpu_l1_latency 20
+-gpgpu_gmem_skip_L1D 0
+-gpgpu_flush_l1_cache 1
+-gpgpu_n_cluster_ejection_buffer_size 32
+# shared memory configuration
+-gpgpu_shmem_size 98304
+-gpgpu_shmem_sizeDefault 98304
+-gpgpu_shmem_per_block 65536
+-gpgpu_smem_latency 20
+
+# 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). This gives us 6MB L2 cache
+-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32
+-gpgpu_cache:dl2_texture_only 0
+-gpgpu_dram_partition_queues 64:64:64:64
+-gpgpu_perf_sim_memcpy 1
+-gpgpu_memory_partition_indexing 2
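The L2 sizing stated in the comment above can be checked directly from the dl2 geometry and the partition counts; a quick reader-side check, not part of the patch:

    # Reader-side check (not part of this patch): total L2 equals the
    # per-sub-partition capacity times n_mem * sub_partitions_per_channel.
    sets, line_bytes, assoc = 32, 128, 24      # from -gpgpu_cache:dl2 S:32:128:24
    n_mem, sub_per_channel = 32, 2             # from the GV100 options above
    per_sub_partition = sets * line_bytes * assoc           # 96 KB
    total_l2 = per_sub_partition * n_mem * sub_per_channel  # 6 MB
    assert per_sub_partition == 96 * 1024
    assert total_l2 == 6 * 1024 * 1024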
+
+# 128 KB Inst.
+-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4
+-gpgpu_inst_fetch_throughput 4
+# 128 KB Tex
+# Note: TEX is deprecated in Volta; it is used for legacy apps only. Use the L1D cache instead with the .nc modifier or the __ldg method
+-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2
+# 64 KB Const
+-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4
+-gpgpu_perfect_inst_const_cache 1
+
+# interconnection
+#-network_mode 1
+#-inter_config_file config_volta_islip.icnt
+# use built-in local xbar
+-network_mode 2
+-icnt_in_buffer_limit 512
+-icnt_out_buffer_limit 512
+-icnt_subnets 2
+-icnt_flit_size 40
+-icnt_arbiter_algo 1
+
+# memory partition latency config
+-gpgpu_l2_rop_latency 160
+-dram_latency 100
+
+# dram model config
+-gpgpu_dram_scheduler 1
+-gpgpu_frfcfs_dram_sched_queue_size 64
+-gpgpu_dram_return_queue_size 192
+
+# for HBM, three stacks, 24 channels, each (128 bits) 16 bytes wide
+-gpgpu_n_mem_per_ctrlr 1
+-gpgpu_dram_buswidth 16
+-gpgpu_dram_burst_length 2
+-dram_data_command_freq_ratio 2  # HBM is DDR
+-gpgpu_mem_address_mask 1
+-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCB.CCCSSSSS
+
+# HBM timings are adopted from the Hynix JESD235 standard and the NVIDIA HPCA 2017 paper (http://www.cs.utah.edu/~nil/pubs/hpca17.pdf)
+# Timing for 1 GHz
+# tRRDl and tWTR are missing, need to be added
+#-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=4:RCD=14:RAS=33:RP=14:RC=47:
+#                        CL=14:WL=2:CDLR=3:WR=12:nbkgrp=4:CCDL=2:RTPL=4"
+
+# Timing for 850 MHz; the V100 HBM runs at 850 MHz
+-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=3:RCD=12:RAS=28:RP=12:RC=40:
+                        CL=12:WL=2:CDLR=3:WR=10:nbkgrp=4:CCDL=2:RTPL=3"
+
+# HBM has dual bus interface, in which it can issue two col and row commands at a time
+-dram_dual_bus_interface 1
+# select lower bits for bnkgrp to increase bnkgrp parallelism
+-dram_bnk_indexing_policy 0
+-dram_bnkgrp_indexing_policy 1
+
+#-dram_seperate_write_queue_enable 1
+#-dram_write_queue_size 64:56:32
+
+# stat collection
+-gpgpu_memlatency_stat 14
+-gpgpu_runtime_stat 500
+-enable_ptx_file_line_stats 1
+-visualizer_enabled 0
+
+# tracing functionality
+#-trace_enabled 1
+#-trace_components WARP_SCHEDULER,SCOREBOARD
+#-trace_sampling_core 0
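The -gpgpu_mem_addr_mapping mask above assigns each physical-address bit to a DRAM field (R = row, B = bank, C = column, S = burst/byte offset; the dots are only visual separators, and the dramid@8 prefix selects the memory channel starting at bit 8). A simplified reader-side sketch of the decoding idea, not part of the patch (decode_addr is a hypothetical helper, and the channel-selection prefix is ignored):

    # Simplified sketch (not part of this patch): group physical-address bits
    # by the field letters of a -gpgpu_mem_addr_mapping mask.
    def decode_addr(addr, mask):
        bits = mask.split(";")[-1].replace(".", "")   # keep only the field letters
        fields = {}
        for pos, tag in enumerate(bits):              # leftmost letter = MSB
            if tag == "0":
                continue
            bit = len(bits) - 1 - pos
            fields[tag] = (fields.get(tag, 0) << 1) | ((addr >> bit) & 1)
        return fields

    mask = "dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCB.CCCSSSSS"
    print(decode_addr(0x12345678, mask))  # -> per-field values for R, B, C, S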
diff --git a/configs/tested-cfgs/SM7_QV100/accelwattch_ptx_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/accelwattch_ptx_sim_alt.xml
new file mode 100644
index 000000000..0c6f21147
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/accelwattch_ptx_sim_alt.xml
@@ -0,0 +1,623 @@
+ [623 lines of AccelWattch XML power-model parameters; XML tags lost in extraction]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/accelwattch_sass_hw.xml b/configs/tested-cfgs/SM7_QV100/accelwattch_sass_hw.xml
new file mode 100644
index 000000000..64f89d646
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/accelwattch_sass_hw.xml
@@ -0,0 +1,613 @@
+ [613 lines of AccelWattch XML power-model parameters; XML tags lost in extraction]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/accelwattch_sass_hybrid.xml b/configs/tested-cfgs/SM7_QV100/accelwattch_sass_hybrid.xml
new file mode 100644
index 000000000..175f1fd47
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/accelwattch_sass_hybrid.xml
@@ -0,0 +1,613 @@
+ [613 lines of AccelWattch XML power-model parameters; XML tags lost in extraction]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/accelwattch_sass_sim.xml b/configs/tested-cfgs/SM7_QV100/accelwattch_sass_sim.xml
new file mode 100644
index 000000000..570332d1c
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/accelwattch_sass_sim.xml
@@ -0,0 +1,613 @@
+ [613 lines of AccelWattch XML power-model parameters; XML tags lost in extraction]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/accelwattch_sass_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/accelwattch_sass_sim_alt.xml
new file mode 100644
index 000000000..9998e9656
--- /dev/null
+++ b/configs/tested-cfgs/SM7_QV100/accelwattch_sass_sim_alt.xml
@@ -0,0 +1,613 @@
+ [613 lines of AccelWattch XML power-model parameters; XML tags lost in extraction]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_QV100/gpgpusim.config b/configs/tested-cfgs/SM7_QV100/gpgpusim.config
index c4818d10f..b3384afcb 100644
--- a/configs/tested-cfgs/SM7_QV100/gpgpusim.config
+++ b/configs/tested-cfgs/SM7_QV100/gpgpusim.config
@@ -1,4 +1,34 @@
-# This
config models the Volta +# Copyright (c) 2018-2021, Vijay Kandiah, Junrui Pan, Mahmoud Khairy, Scott Peverelle, Timothy Rogers, Tor M. Aamodt, Nikos Hardavellas +# Northwestern University, Purdue University, The University of British Columbia +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer; +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution; +# 3. Neither the names of Northwestern University, Purdue University, +# The University of British Columbia nor the names of their contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
+
+
+# This config models the Volta Quadro V100
+# For more info about volta architecture:
+# http://images.nvidia.com/content/volta-architecture/pdf/volta-architecture-whitepaper.pdf
+# https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8344474&tag=1#
@@ -20,6 +50,7 @@
 -gpgpu_runtime_pending_launch_count_limit 2048
 -gpgpu_kernel_launch_latency 5000
 -gpgpu_TB_launch_latency 0
+-gpgpu_max_concurrent_kernel 128
 
 # Compute Capability
 -gpgpu_compute_capability_major 7
@@ -34,6 +65,7 @@
 -gpgpu_n_cores_per_cluster 1
 -gpgpu_n_mem 32
 -gpgpu_n_sub_partition_per_mchannel 2
+-gpgpu_clock_gated_lanes 1
 
 # volta clock domains
 #-gpgpu_clock_domains <Core Clock>:<Interconnect Clock>:<L2 Clock>:<DRAM Clock>
@@ -94,18 +126,18 @@
 -gpgpu_shmem_num_banks 32
 -gpgpu_shmem_limited_broadcast 0
 -gpgpu_shmem_warp_parts 1
--gpgpu_coalesce_arch 60
+-gpgpu_coalesce_arch 70
 
 # Volta has four schedulers per core
 -gpgpu_num_sched_per_core 4
 # Greedy then oldest scheduler
--gpgpu_scheduler gto
+-gpgpu_scheduler lrr
 ## In Volta, a warp scheduler can issue 1 inst per cycle
 -gpgpu_max_insn_issue_per_warp 1
 -gpgpu_dual_issue_diff_exec_units 1
 
 ## L1/shared memory configuration
-# <nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>
+# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>
 # ** Optional parameter - Required when mshr_type==Texture Fifo
 # Defualt config is 32KB DL1 and 96KB shared memory
 # In Volta, we assign the remaining shared memory to L1 cache
@@ -113,17 +145,21 @@
 # For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x
 # disable this mode in case of multi kernels/apps execution
 -gpgpu_adaptive_cache_config 1
-# Volta unified cache has four banks
+-gpgpu_shmem_option 0,8,16,32,64,96
+-gpgpu_unified_l1d_size 128
+# L1 cache configuration
 -gpgpu_l1_banks 4
--gpgpu_cache:dl1 S:1:128:256,L:L:s:N:L,A:256:8,16:0,32
+-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32
+-gpgpu_l1_cache_write_ratio 25
+-gpgpu_l1_latency 20
+-gpgpu_gmem_skip_L1D 0
+-gpgpu_flush_l1_cache 1
+-gpgpu_n_cluster_ejection_buffer_size 32
+# shared memory configuration
 -gpgpu_shmem_size 98304
 -gpgpu_shmem_sizeDefault 98304
 -gpgpu_shmem_per_block 65536
--gpgpu_gmem_skip_L1D 0
--gpgpu_n_cluster_ejection_buffer_size 32
--gpgpu_l1_latency 20
 -gpgpu_smem_latency 20
--gpgpu_flush_l1_cache 1
 # 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). This gives us 6MB L2 cache
 -gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32
@@ -195,11 +231,7 @@
 -enable_ptx_file_line_stats 1
 -visualizer_enabled 0
 
-# power model configs, disable it untill we create a real energy model for Volta
--power_simulation_enabled 0
-
 # tracing functionality
 #-trace_enabled 1
 #-trace_components WARP_SCHEDULER,SCOREBOARD
-#-trace_sampling_core 0
-
+#-trace_sampling_core 0
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_TITANV/accelwattch_ptx_sim.xml b/configs/tested-cfgs/SM7_TITANV/accelwattch_ptx_sim.xml
new file mode 100644
index 000000000..d94d5bdd4
--- /dev/null
+++ b/configs/tested-cfgs/SM7_TITANV/accelwattch_ptx_sim.xml
@@ -0,0 +1,623 @@
+ [623 lines of AccelWattch XML power-model parameters; XML tags lost in extraction]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_TITANV/accelwattch_ptx_sim_alt.xml b/configs/tested-cfgs/SM7_TITANV/accelwattch_ptx_sim_alt.xml
new file mode 100644
index 000000000..0c6f21147
--- /dev/null
+++ b/configs/tested-cfgs/SM7_TITANV/accelwattch_ptx_sim_alt.xml
@@ -0,0 +1,623 @@
+ [623 lines of AccelWattch XML power-model parameters; XML tags lost in extraction]
\ No newline at end of file
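Backing out the L2 capacities quoted in these Volta configs (my arithmetic, not the patch's): the shared dl2 string S:32:128:24 gives 32 sets x 128 B x 24 ways = 96 KB per sub-partition, and total L2 then scales with the channel count, which is why QV100 (32 channels) reaches 6 MB while the TITANV config further down, with 24 channels, reaches 4.5 MB.

#include <cstdio>

int main() {
  // "-gpgpu_cache:dl2 S:32:128:24" -> 32 sets x 128 B lines x 24 ways.
  const long per_sub_partition = 32L * 128 * 24;  // 96 KB
  // QV100: -gpgpu_n_mem 32 and -gpgpu_n_sub_partition_per_mchannel 2.
  std::printf("QV100 L2  = %ld MB\n",
              per_sub_partition * 32 * 2 / (1024 * 1024));      // 6 MB
  // TITAN V reuses the same dl2 string with 24 memory channels.
  std::printf("TITANV L2 = %.1f MB\n",
              per_sub_partition * 24 * 2 / (1024.0 * 1024.0));  // 4.5 MB
  return 0;
}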
diff --git a/configs/tested-cfgs/SM7_TITANV/accelwattch_sass_hw.xml b/configs/tested-cfgs/SM7_TITANV/accelwattch_sass_hw.xml
new file mode 100644
index 000000000..64f89d646
--- /dev/null
+++ b/configs/tested-cfgs/SM7_TITANV/accelwattch_sass_hw.xml
@@ -0,0 +1,613 @@
+ [613 lines of AccelWattch XML power-model parameters; XML tags lost in extraction]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_TITANV/accelwattch_sass_hybrid.xml b/configs/tested-cfgs/SM7_TITANV/accelwattch_sass_hybrid.xml
new file mode 100644
index 000000000..175f1fd47
--- /dev/null
+++ b/configs/tested-cfgs/SM7_TITANV/accelwattch_sass_hybrid.xml
@@ -0,0 +1,613 @@
+ [613 lines of AccelWattch XML power-model parameters; XML tags lost in extraction]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_TITANV/accelwattch_sass_sim.xml b/configs/tested-cfgs/SM7_TITANV/accelwattch_sass_sim.xml
new file mode 100644
index 000000000..570332d1c
--- /dev/null
+++ b/configs/tested-cfgs/SM7_TITANV/accelwattch_sass_sim.xml
@@ -0,0 +1,613 @@
+ [613 lines of AccelWattch XML power-model parameters; XML tags lost in extraction]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_TITANV/accelwattch_sass_sim_alt.xml b/configs/tested-cfgs/SM7_TITANV/accelwattch_sass_sim_alt.xml
new file mode 100644
index 000000000..9998e9656
--- /dev/null
+++ b/configs/tested-cfgs/SM7_TITANV/accelwattch_sass_sim_alt.xml
@@ -0,0 +1,613 @@
+ [613 lines of AccelWattch XML power-model parameters; XML tags lost in extraction]
\ No newline at end of file
diff --git a/configs/tested-cfgs/SM7_TITANV/gpgpusim.config b/configs/tested-cfgs/SM7_TITANV/gpgpusim.config
index 3fa51ee14..c37aaf053 100644
--- a/configs/tested-cfgs/SM7_TITANV/gpgpusim.config
+++ b/configs/tested-cfgs/SM7_TITANV/gpgpusim.config
@@ -18,6 +18,7 @@
 -gpgpu_heap_size_limit 8388608
 -gpgpu_runtime_sync_depth_limit 2
 -gpgpu_runtime_pending_launch_count_limit 2048
+-gpgpu_max_concurrent_kernel 128
 
 # Compute Capability
 -gpgpu_compute_capability_major 7
@@ -100,13 +101,13 @@
 # Volta has four schedulers per core
 -gpgpu_num_sched_per_core 4
 # Greedy then oldest scheduler
--gpgpu_scheduler gto
+-gpgpu_scheduler lrr
 ## In Volta, a warp scheduler can issue 1 inst per cycle
 -gpgpu_max_insn_issue_per_warp 1
 -gpgpu_dual_issue_diff_exec_units 1
 
 ## L1/shared memory configuration
-# <nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>
+# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>
 # ** Optional parameter - Required when mshr_type==Texture Fifo
 # Defualt config is 32KB DL1 and 96KB shared memory
 # In Volta, we assign the remaining shared memory to L1 cache
@@ -114,17 +115,21 @@
 # For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x
 # disable this mode in case of multi kernels/apps execution
 -gpgpu_adaptive_cache_config 1
-# Volta unified cache has four banks
+-gpgpu_shmem_option 0,8,16,32,64,96
+-gpgpu_unified_l1d_size 128
+# L1 cache configuration
 -gpgpu_l1_banks 4
--gpgpu_cache:dl1 S:1:128:256,L:L:s:N:L,A:256:8,16:0,32
+-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32
+-gpgpu_l1_cache_write_ratio 25
+-gpgpu_gmem_skip_L1D 0
+-gpgpu_l1_latency 20
+-gpgpu_flush_l1_cache 1
+-gpgpu_n_cluster_ejection_buffer_size 32
+# shared memory configuration
 -gpgpu_shmem_size 98304
 -gpgpu_shmem_sizeDefault 98304
 -gpgpu_shmem_per_block 65536
--gpgpu_gmem_skip_L1D 0
--gpgpu_n_cluster_ejection_buffer_size 32
--gpgpu_l1_latency 20
 -gpgpu_smem_latency 20
--gpgpu_flush_l1_cache 1
 
 # 32 sets, each 128 bytes 24-way for
each memory sub partition (96 KB per memory sub partition). This gives us 4.5MB L2 cache -gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 @@ -196,9 +201,6 @@ -enable_ptx_file_line_stats 1 -visualizer_enabled 0 -# power model configs, disable it untill we create a real energy model for Volta --power_simulation_enabled 0 - # tracing functionality #-trace_enabled 1 #-trace_components WARP_SCHEDULER,SCOREBOARD diff --git a/configs/tested-cfgs/SM86_RTX3070/config_ampere_islip.icnt b/configs/tested-cfgs/SM86_RTX3070/config_ampere_islip.icnt new file mode 100644 index 000000000..6775d5d6f --- /dev/null +++ b/configs/tested-cfgs/SM86_RTX3070/config_ampere_islip.icnt @@ -0,0 +1,74 @@ +//21*1 fly with 32 flits per packet under gpgpusim injection mode +use_map = 0; +flit_size = 40; + +// currently we do not use this, see subnets below +network_count = 2; + +// Topology +topology = fly; +k = 78; +n = 1; + +// Routing + +routing_function = dest_tag; + + +// Flow control + +num_vcs = 1; +vc_buf_size = 256; +input_buffer_size = 256; +ejection_buffer_size = 256; +boundary_buffer_size = 256; + +wait_for_tail_credit = 0; + +// Router architecture + +vc_allocator = islip; //separable_input_first; +sw_allocator = islip; //separable_input_first; +alloc_iters = 1; + +credit_delay = 0; +routing_delay = 0; +vc_alloc_delay = 1; +sw_alloc_delay = 1; + +input_speedup = 1; +output_speedup = 1; +internal_speedup = 2.0; + +// Traffic, GPGPU-Sim does not use this + +traffic = uniform; +packet_size ={{1,2,3,4},{10,20}}; +packet_size_rate={{1,1,1,1},{2,1}}; + +// Simulation - Don't change + +sim_type = gpgpusim; +//sim_type = latency; +injection_rate = 0.1; + +subnets = 2; + +// Always use read and write no matter following line +//use_read_write = 1; + + +read_request_subnet = 0; +read_reply_subnet = 1; +write_request_subnet = 0; +write_reply_subnet = 1; + +read_request_begin_vc = 0; +read_request_end_vc = 0; +write_request_begin_vc = 0; +write_request_end_vc = 0; +read_reply_begin_vc = 0; +read_reply_end_vc = 0; +write_reply_begin_vc = 0; +write_reply_end_vc = 0; + diff --git a/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config b/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config new file mode 100644 index 000000000..d26b1a621 --- /dev/null +++ b/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config @@ -0,0 +1,180 @@ +# functional simulator specification +-gpgpu_ptx_instruction_classification 0 +-gpgpu_ptx_sim_mode 0 +-gpgpu_ptx_force_max_capability 86 + +# Device Limits +-gpgpu_stack_size_limit 1024 +-gpgpu_heap_size_limit 8388608 +-gpgpu_runtime_sync_depth_limit 2 +-gpgpu_runtime_pending_launch_count_limit 2048 +-gpgpu_kernel_launch_latency 5000 +-gpgpu_TB_launch_latency 0 +-gpgpu_max_concurrent_kernel 128 + +# Compute Capability +-gpgpu_compute_capability_major 8 +-gpgpu_compute_capability_minor 6 + +# PTX execution-driven +-gpgpu_ptx_convert_to_ptxplus 0 +-gpgpu_ptx_save_converted_ptxplus 0 + +# high level architecture configuration +-gpgpu_n_clusters 46 +-gpgpu_n_cores_per_cluster 1 +-gpgpu_n_mem 16 +-gpgpu_n_sub_partition_per_mchannel 2 + +# clock domains +#-gpgpu_clock_domains ::: +-gpgpu_clock_domains 1132:1132:1132:3500.5 + +# shader core pipeline config +-gpgpu_shader_registers 65536 +-gpgpu_registers_per_block 65536 +-gpgpu_occupancy_sm_number 86 + +-gpgpu_shader_core_pipeline 1536:32 +-gpgpu_shader_cta 32 +-gpgpu_simd_model 1 + +# Pipeline widths and number of FUs +# ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE 
+-gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4
+-gpgpu_num_sp_units 4
+-gpgpu_num_sfu_units 4
+-gpgpu_num_dp_units 4
+-gpgpu_num_int_units 4
+-gpgpu_tensor_core_avail 1
+-gpgpu_num_tensor_core_units 4
+
+# Instruction latencies and initiation intervals
+# "ADD,MAX,MUL,MAD,DIV"
+# All Div operations are executed on the SFU unit
+-ptx_opcode_latency_int 4,4,4,4,21
+-ptx_opcode_initiation_int 2,2,2,2,2
+-ptx_opcode_latency_fp 4,4,4,4,39
+-ptx_opcode_initiation_fp 1,1,1,1,2
+-ptx_opcode_latency_dp 64,64,64,64,330
+-ptx_opcode_initiation_dp 64,64,64,64,130
+-ptx_opcode_latency_sfu 21
+-ptx_opcode_initiation_sfu 8
+-ptx_opcode_latency_tesnor 64
+-ptx_opcode_initiation_tensor 64
+
+# sub-core model, in which each scheduler has its own register file and EUs
+# i.e. schedulers are isolated
+-gpgpu_sub_core_model 1
+# disable specialized operand collectors and use generic operand collectors instead
+-gpgpu_enable_specialized_operand_collector 0
+-gpgpu_operand_collector_num_units_gen 8
+-gpgpu_operand_collector_num_in_ports_gen 8
+-gpgpu_operand_collector_num_out_ports_gen 8
+# register banks
+-gpgpu_num_reg_banks 8
+-gpgpu_reg_file_port_throughput 2
+
+# warp scheduling
+-gpgpu_num_sched_per_core 4
+-gpgpu_scheduler lrr
+# a warp scheduler issue mode
+-gpgpu_max_insn_issue_per_warp 1
+-gpgpu_dual_issue_diff_exec_units 1
+
+## L1/shared memory configuration
+# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>
+# ** Optional parameter - Required when mshr_type==Texture Fifo
+# In adaptive cache, we adaptively assign the remaining shared memory to L1 cache
+# For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x
+-gpgpu_adaptive_cache_config 1
+-gpgpu_shmem_option 0,8,16,32,64,100
+-gpgpu_unified_l1d_size 128
+# L1 cache configuration
+-gpgpu_l1_banks 4
+-gpgpu_cache:dl1 S:4:128:256,L:T:m:L:L,A:384:48,16:0,32
+-gpgpu_l1_latency 39
+-gpgpu_gmem_skip_L1D 0
+-gpgpu_flush_l1_cache 1
+-gpgpu_n_cluster_ejection_buffer_size 32
+-gpgpu_l1_cache_write_ratio 25
+
+# shared memory configuration
+-gpgpu_shmem_size 102400
+-gpgpu_shmem_sizeDefault 102400
+-gpgpu_shmem_per_block 49152
+-gpgpu_smem_latency 29
+# shared memory bank conflict detection
+-gpgpu_shmem_num_banks 32
+-gpgpu_shmem_limited_broadcast 0
+-gpgpu_shmem_warp_parts 1
+-gpgpu_coalesce_arch 86
+
+# L2 cache
+-gpgpu_cache:dl2 S:64:128:16,L:B:m:L:P,A:192:4,32:0,32
+-gpgpu_cache:dl2_texture_only 0
+-gpgpu_dram_partition_queues 64:64:64:64
+-gpgpu_perf_sim_memcpy 1
+-gpgpu_memory_partition_indexing 2
+
+# 128 KB Inst.
+-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4
+-gpgpu_inst_fetch_throughput 4
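The -gpgpu_shmem_option list above is the simulator-side analogue of the per-kernel shared-memory carveout a CUDA application can request on real hardware. As a hedged illustration using the standard CUDA runtime API (not part of this patch; kern is a stand-in kernel):

#include <cuda_runtime.h>
#include <cstdio>

__global__ void kern() {}

int main() {
  // Ask for ~50% of the unified array as shared memory; the driver (and,
  // analogously, -gpgpu_adaptive_cache_config) rounds to a supported
  // carveout such as the 0/8/16/32/64/100 KB options listed above.
  cudaFuncSetAttribute((const void *)kern,
                       cudaFuncAttributePreferredSharedMemoryCarveout, 50);
  kern<<<1, 32>>>();
  cudaDeviceSynchronize();
  std::printf("done\n");
  return 0;
}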
+# 128 KB Tex
+# Note: TEX is deprecated since Volta; it is used for legacy apps only. Use the L1D cache instead with the .nc modifier or the __ldg() method
+-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2
+# 64 KB Const
+-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4
+-gpgpu_perfect_inst_const_cache 1
+
+# interconnection
+# use built-in local xbar
+-network_mode 2
+-icnt_in_buffer_limit 512
+-icnt_out_buffer_limit 512
+-icnt_subnets 2
+-icnt_flit_size 40
+-icnt_arbiter_algo 1
+
+# memory partition latency config
+-gpgpu_l2_rop_latency 187
+-dram_latency 254
+
+# dram sched config
+-gpgpu_dram_scheduler 1
+-gpgpu_frfcfs_dram_sched_queue_size 64
+-gpgpu_dram_return_queue_size 192
+
+# dram model config
+-gpgpu_n_mem_per_ctrlr 1
+-gpgpu_dram_buswidth 2
+-gpgpu_dram_burst_length 16
+-dram_data_command_freq_ratio 4
+-gpgpu_mem_address_mask 1
+-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCC.BCCSSSSS
+
+# Mem timing
+-gpgpu_dram_timing_opt nbk=16:CCD=4:RRD=12:RCD=24:RAS=55:RP=24:RC=78:CL=24:WL=8:CDLR=10:WR=24:nbkgrp=4:CCDL=6:RTPL=4
+-dram_dual_bus_interface 0
+
+# select lower bits for bnkgrp to increase bnkgrp parallelism
+-dram_bnk_indexing_policy 0
+-dram_bnkgrp_indexing_policy 1
+
+#-dram_seperate_write_queue_enable 1
+#-dram_write_queue_size 64:56:32
+
+# stat collection
+-gpgpu_memlatency_stat 14
+-gpgpu_runtime_stat 500
+-enable_ptx_file_line_stats 1
+-visualizer_enabled 0
+
+# power model configs, disable it until we create a real energy model
+-power_simulation_enabled 0
+
+# tracing functionality
+#-trace_enabled 1
+#-trace_components WARP_SCHEDULER,SCOREBOARD
+#-trace_sampling_core 0
+
diff --git a/cuobjdump_to_ptxplus/Makefile b/cuobjdump_to_ptxplus/Makefile
index e95136a95..0eb7d1e1d 100644
--- a/cuobjdump_to_ptxplus/Makefile
+++ b/cuobjdump_to_ptxplus/Makefile
@@ -28,10 +28,10 @@ $(OUTPUT_DIR)/cuobjdump_to_ptxplus: $(OUTPUT_DIR)/cuobjdumpInst.o $(OUTPUT_DIR)/
 
 $(OUTPUT_DIR)/lex.ptx_.c : ptx.l
-	${LEX} ${LEXFLAGS} -o$(OUTPUT_DIR)/lex.ptx_.c ptx.l
+	${LEX} ${LEXFLAGS} -o$(OUTPUT_DIR)/lex.ptx_.c ptx.l 2> /dev/null
 
 $(OUTPUT_DIR)/ptx.tab.c : ptx.y
-	${YACC} ${YFLAGS} --name-prefix=ptx_ -v ptx.y --file-prefix=$(OUTPUT_DIR)/ptx
+	${YACC} ${YFLAGS} --name-prefix=ptx_ -v ptx.y --file-prefix=$(OUTPUT_DIR)/ptx 2> /dev/null
 
 $(OUTPUT_DIR)/ptx.tab.h :$(OUTPUT_DIR)/ptx.tab.c
diff --git a/cuobjdump_to_ptxplus/cuobjdumpInstList.cc b/cuobjdump_to_ptxplus/cuobjdumpInstList.cc
index 32834c745..d42e59e51 100644
--- a/cuobjdump_to_ptxplus/cuobjdumpInstList.cc
+++ b/cuobjdump_to_ptxplus/cuobjdumpInstList.cc
@@ -505,7 +505,7 @@ std::string cuobjdumpInstList::parseCuobjdumpRegister(std::string reg, bool lo,
   } else {
     output("ERROR: unknown register type.\n");
     printf("\nERROR: unknown register type: ");
-    printf(reg.c_str());
+    printf("%s", reg.c_str());
     printf("\n");
     assert(0);
   }
diff --git a/cuobjdump_to_ptxplus/cuobjdump_to_ptxplus.cc b/cuobjdump_to_ptxplus/cuobjdump_to_ptxplus.cc
index 82dcb7cad..5c6fdcd1b 100644
--- a/cuobjdump_to_ptxplus/cuobjdump_to_ptxplus.cc
+++ b/cuobjdump_to_ptxplus/cuobjdump_to_ptxplus.cc
@@ -54,7 +54,7 @@ FILE *ptxplus_out;
 
 void output(const char * text) {
   //printf(text);
-  fprintf(ptxplus_out, text);
+  fprintf(ptxplus_out, "%s", text);
 }
 
 void output(const std::string text) {
diff --git a/format-code.sh b/format-code.sh
index fb1cc909a..acd33ab1c 100755
--- a/format-code.sh
+++ b/format-code.sh
@@ -1,5 +1,6 @@
 # This bash script formats GPGPU-Sim using clang-format
 THIS_DIR="$( cd "$( dirname "$BASH_SOURCE" )" && pwd )"
+echo "Running clang-format on $THIS_DIR"
 clang-format -i ${THIS_DIR}/libcuda/*.h
 clang-format -i
${THIS_DIR}/libcuda/*.cc clang-format -i ${THIS_DIR}/src/*.h @@ -8,8 +9,5 @@ clang-format -i ${THIS_DIR}/src/gpgpu-sim/*.h clang-format -i ${THIS_DIR}/src/gpgpu-sim/*.cc clang-format -i ${THIS_DIR}/src/cuda-sim/*.h clang-format -i ${THIS_DIR}/src/cuda-sim/*.cc -clang-format -i ${THIS_DIR}/src/gpuwattch/*.h -clang-format -i ${THIS_DIR}/src/gpuwattch/*.cc -clang-format -i ${THIS_DIR}/src/trace-driven/*.h -clang-format -i ${THIS_DIR}/src/trace-driven/*.cc -clang-format -i ${THIS_DIR}/src/trace-driven/ISA_Def/*.h +clang-format -i ${THIS_DIR}/src/accelwattch/*.h +clang-format -i ${THIS_DIR}/src/accelwattch/*.cc \ No newline at end of file diff --git a/gpgpusim_check.cmake b/gpgpusim_check.cmake new file mode 100644 index 000000000..486d66dc9 --- /dev/null +++ b/gpgpusim_check.cmake @@ -0,0 +1,135 @@ +# Dependency checking +# Unset FindCUDA variables so that it +# gets reconfigured +include(gpgpusim_unset_cuda.cmake) + +find_package(Git REQUIRED) +find_package(BISON REQUIRED) +find_package(FLEX REQUIRED) +find_package(ZLIB REQUIRED) +find_package(CUDAToolkit REQUIRED) +find_package(Doxygen) +find_package(Python3) + +# GPGPU-Sim additional checking and info +message(CHECK_START "Additional settings for ${CMAKE_PROJECT_NAME}") +list(APPEND CMAKE_MESSAGE_INDENT " ") + +# Check for OS +message(CHECK_START "Checking for OS") +if((NOT APPLE) AND (NOT UNIX) AND (NOT LINUX)) + message(FATAL_ERROR "${CMAKE_SYSTEM_NAME} not supported") +else() + message(CHECK_PASS ${CMAKE_SYSTEM_NAME}) +endif() + +# Check for version +message(CHECK_START "Checking GPGPU-Sim version") +message(CHECK_PASS "${CMAKE_PROJECT_VERSION}") + +# Check for git commit hash +message(CHECK_START "Checking git commit hash") +# Get the latest abbreviated commit hash of the working branch +execute_process( + COMMAND git log -1 --format=%H + WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR} + OUTPUT_VARIABLE GPGPUSIM_CONFIG_GIT_HASH + OUTPUT_STRIP_TRAILING_WHITESPACE + RESULT_VARIABLE GPGPUSIM_CHECK_GIT_HASH +) +if(${GPGPUSIM_CHECK_GIT_HASH}) + message(CHECK_FAIL "not a git repo") +else() + message(CHECK_PASS "${GPGPUSIM_CONFIG_GIT_HASH}") +endif() + +# Check for compiler and version +message(CHECK_START "Checking CXX compiler") +if(NOT (${CMAKE_CXX_COMPILER_ID} STREQUAL GNU)) + message(CHECK_FAIL "GPGPU-Sim only tested with GCC: ${CMAKE_CXX_COMPILER_ID}") +else() + message(CHECK_PASS "${CMAKE_CXX_COMPILER}") +endif() +message(CHECK_START "Checking CXX compiler version") +message(CHECK_PASS "${CMAKE_CXX_COMPILER_VERSION}") +set(GPGPSIM_CC_VERSION ) + +# Check for CUDA nvcc and version +# Check already done with find_package, here just to display the path and version +message(CHECK_START "Checking CUDA compiler") +if(NOT CUDAToolkit_FOUND) + message(CHECK_FAIL "not found") +else() + message(CHECK_PASS "${CUDAToolkit_NVCC_EXECUTABLE}") + message(CHECK_START "Checking CUDA compiler version") + message(CHECK_PASS "${CUDAToolkit_VERSION}") + if((CUDAToolkit_VERSION VERSION_LESS 2.0.3) OR (CUDAToolkit_VERSION VERSION_GREATER 11.10.0)) + message(FATAL_ERROR "GPGPU-Sim ${CMAKE_PROJECT_VERSION} not tested with CUDA version ${CUDAToolkit_VERSION} (please see README)") + endif() +endif() + +# Check for Power model +# TODO How to configure the project to look for it? 
+message(CHECK_START "Checking for GPGPU-Sim power model") +if(IS_DIRECTORY ${PROJECT_SOURCE_DIR}/src/accelwattch) + if(NOT EXISTS ${PROJECT_SOURCE_DIR}/src/accelwattch/gpgpu_sim.verify) + message(FATAL_ERROR "gpgpu_sim.verify not found in ${PROJECT_SOURCE_DIR}/src/accelwattch/") + endif() + message(CHECK_PASS "${PROJECT_SOURCE_DIR}/src/accelwattch/") + set(GPGPUSIM_USE_POWER_MODEL True) + set(GPGPUSIM_POWER_MODEL ${PROJECT_SOURCE_DIR}/src/accelwattch) +elseif(DEFINED ${GPGPUSIM_POWER_MODEL}) + if(NOT EXISTS ${GPGPUSIM_POWER_MODEL}/gpgpu_sim.verify) + message(FATAL_ERROR "gpgpu_sim.verify not found in ${GPGPUSIM_POWER_MODEL} - Either incorrect directory or incorrect McPAT version") + endif() + message(CHECK_PASS "${GPGPUSIM_POWER_MODEL}") + set(GPGPUSIM_USE_POWER_MODEL True) +else() + message(CHECK_PASS "configured without a power model") +endif() + +# Set Build path +# Get CUDA version +set(CUDA_VERSION_STRING "${CUDAToolkit_VERSION_MAJOR}.${CUDAToolkit_VERSION_MINOR}") +# execute_process( +# COMMAND ${CUDAToolkit_NVCC_EXECUTABLE} --version +# COMMAND awk "/release/ {print $5;}" +# COMMAND sed "s/,//" +# WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR} +# OUTPUT_VARIABLE CUDA_VERSION_STRING +# OUTPUT_STRIP_TRAILING_WHITESPACE +# ) + +# CMake cannot do formatted string output, so we just use the good old `awk` +# math(EXPR CUDA_VERSION_NUMBER_MAJOR "${CUDAToolkit_VERSION_MAJOR} * 10") +# math(EXPR CUDA_VERSION_NUMBER_MINOR "${CUDAToolkit_VERSION_MINOR} * 10") +# set(CUDA_VERSION_NUMBER "${CUDA_VERSION_NUMBER_MAJOR}${CUDA_VERSION_NUMBER_MINOR}") +execute_process( + COMMAND echo ${CUDA_VERSION_STRING} + COMMAND sed "s/\\./ /" + COMMAND awk "{printf(\"%02u%02u\", 10*int($1), 10*$2);}" + WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR} + OUTPUT_VARIABLE CUDA_VERSION_NUMBER + OUTPUT_STRIP_TRAILING_WHITESPACE +) + +# Get debug or release +# Set with -DCMAKE_BUILD_TYPE=Debug|Release to change build type +message(CHECK_START "Checking for CMAKE_BUILD_TYPE") +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE Release) + set(GPGPUSIM_BUILD_MODE "release" CACHE STRING "" FORCE) +else() + string(TOLOWER "${CMAKE_BUILD_TYPE}" GPGPUSIM_BUILD_MODE) +endif() +message(CHECK_PASS "${CMAKE_BUILD_TYPE}") +# TODO: Make this step an installation phase that handle copying so and creating symlinks +message(STATUS "Setting binary directory to ${CMAKE_BINARY_DIR}") + +# TODO OpenCL check/support? 
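For readers of the generated setup script, the CUDA_VERSION_NUMBER produced by the sed/awk pipeline above is the major and minor version each scaled by ten and zero-padded, e.g. 11.7 becomes 11070; that is why later checks compare against values like 6000 (CUDA 6.0). A small C++ equivalent of that pipeline, for illustration only:

#include <cstdio>

int main() {
  // Mirrors: echo "11.7" | sed "s/\./ /" | awk '{printf("%02u%02u", 10*int($1), 10*$2)}'
  unsigned major = 11, minor = 7;
  char buf[16];
  std::snprintf(buf, sizeof buf, "%02u%02u", 10 * major, 10 * minor);
  std::printf("CUDA_VERSION_NUMBER = %s\n", buf);  // prints 11070
  return 0;
}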
+ +list(POP_BACK CMAKE_MESSAGE_INDENT) +message(CHECK_PASS "done") +message(STATUS "Be sure to run 'source setup' " + "before you run CUDA program with GPGPU-Sim or building with external " + "simulator like SST") \ No newline at end of file diff --git a/gpgpusim_gen_build_string.cmake b/gpgpusim_gen_build_string.cmake new file mode 100644 index 000000000..4559570c4 --- /dev/null +++ b/gpgpusim_gen_build_string.cmake @@ -0,0 +1,27 @@ +# Get hash +execute_process( + COMMAND git log -1 --format=%h + WORKING_DIRECTORY ${INPUT_DIR} + OUTPUT_VARIABLE GPGPUSIM_GIT_HASH + OUTPUT_STRIP_TRAILING_WHITESPACE +) + +# Get diff +execute_process( + COMMAND git diff --numstat + COMMAND wc + COMMAND sed -re "s/^\\s+([0-9]+).*/\\1./" + WORKING_DIRECTORY ${INPUT_DIR} + OUTPUT_VARIABLE GPGPUSIM_GIT_DIFF + OUTPUT_STRIP_TRAILING_WHITESPACE +) +execute_process( + COMMAND git diff --numstat --staged + COMMAND wc + COMMAND sed -re "s/^\\s+([0-9]+).*/\\1./" + WORKING_DIRECTORY ${INPUT_DIR} + OUTPUT_VARIABLE GPGPUSIM_GIT_DIFF_STAGED + OUTPUT_STRIP_TRAILING_WHITESPACE +) +set(GPGPUSIM_BUILD_STRING "gpgpu-sim_git-commit-${GPGPUSIM_GIT_HASH}_modified_${GPGPUSIM_GIT_DIFF}${GPGPUSIM_GIT_DIFF_STAGED}") +configure_file(${INPUT_DIR}/version.in ${OUTPUT_DIR}/detailed_version) diff --git a/gpgpusim_gen_setup_environment.cmake b/gpgpusim_gen_setup_environment.cmake new file mode 100644 index 000000000..e74a7f5c4 --- /dev/null +++ b/gpgpusim_gen_setup_environment.cmake @@ -0,0 +1,31 @@ +# Need to create a setup script to set some variables for others to interact with +set(SETUP_SCRIPT_FILENAME "setup") +message(STATUS "Writing setup commands to '${SETUP_SCRIPT_FILENAME}'") +file(WRITE ${SETUP_SCRIPT_FILENAME} "export GPGPUSIM_SETUP_ENVIRONMENT_WAS_RUN=1\n") +file(APPEND ${SETUP_SCRIPT_FILENAME} "export GPGPUSIM_ROOT=${PROJECT_SOURCE_DIR}\n") +file(APPEND ${SETUP_SCRIPT_FILENAME} "export GPGPUSIM_CONFIG=${GPGPUSIM_CONFIG}\n") +file(APPEND ${SETUP_SCRIPT_FILENAME} "export CUDA_INSTALL_PATH=${CUDAToolkit_TARGET_DIR}\n") +file(APPEND ${SETUP_SCRIPT_FILENAME} "export PATH=`echo $PATH | sed 's#$GPGPUSIM_ROOT/bin:$CUDA_INSTALL_PATH/bin:##'`\n") +file(APPEND ${SETUP_SCRIPT_FILENAME} "export PATH=$GPGPUSIM_ROOT/bin:$CUDA_INSTALL_PATH/bin:$PATH\n") +file(APPEND ${SETUP_SCRIPT_FILENAME} "export CUDA_VERSION_NUMBER=${CUDA_VERSION_NUMBER}\n") +if(CUDA_VERSION_NUMBER GREATER_EQUAL 6000) + file(APPEND ${SETUP_SCRIPT_FILENAME} "export PTX_SIM_USE_PTX_FILE=1.ptx\n") + file(APPEND ${SETUP_SCRIPT_FILENAME} "export PTX_SIM_KERNELFILE=_1.ptx\n") + file(APPEND ${SETUP_SCRIPT_FILENAME} "export CUOBJDUMP_SIM_FILE=jj\n") +endif() +# TODO What about OpenCL support? + +# setting LD_LIBRARY_PATH as follows enables GPGPU-Sim to be invoked by +# native CUDA and OpenCL applications. GPGPU-Sim is dynamically linked +# against instead of the CUDA toolkit. This replaces this cumbersome +# static link setup in prior GPGPU-Sim releases. 
+# Create a softlink for backward support +if(APPLE) +file(APPEND ${SETUP_SCRIPT_FILENAME} "export DYLD_LIBRARY_PATH=`echo $DYLD_LIBRARY_PATH | sed -Ee 's#'$GPGPUSIM_ROOT'\/lib\/[0-9]+\/(debug|release):##'`\n") +file(APPEND ${SETUP_SCRIPT_FILENAME} "export DYLD_LIBRARY_PATH=$GPGPUSIM_ROOT/lib/$GPGPUSIM_CONFIG:$DYLD_LIBRARY_PATH\n") +else() +file(APPEND ${SETUP_SCRIPT_FILENAME} "export LD_LIBRARY_PATH=`echo $LD_LIBRARY_PATH | sed -re 's#'$GPGPUSIM_ROOT'\/lib\/[0-9]+\/(debug|release):##'`\n") +file(APPEND ${SETUP_SCRIPT_FILENAME} "export LD_LIBRARY_PATH=$GPGPUSIM_ROOT/lib/$GPGPUSIM_CONFIG:$LD_LIBRARY_PATH\n") +endif() + +# TODO ignore the OPENCL_REMOTE_GPU_HOST part? \ No newline at end of file diff --git a/gpgpusim_install.cmake b/gpgpusim_install.cmake new file mode 100644 index 000000000..1590bf369 --- /dev/null +++ b/gpgpusim_install.cmake @@ -0,0 +1,2 @@ +# TODO Create the build/gcc-X.X/cuda-XXXX/release folder and put so to it +# TODO Also create symlinks to the libcudart.so \ No newline at end of file diff --git a/gpgpusim_unset_cuda.cmake b/gpgpusim_unset_cuda.cmake new file mode 100644 index 000000000..4eaef9107 --- /dev/null +++ b/gpgpusim_unset_cuda.cmake @@ -0,0 +1,60 @@ +# Unset these variable to force a re-search on possible CUDA version changes +unset(CUDAToolkit_BIN_DIR CACHE) +unset(CUDAToolkit_CUPTI_INCLUDE_DIR CACHE) +unset(CUDAToolkit_NVCC_EXECUTABLE CACHE) +unset(CUDAToolkit_rt_LIBRARY CACHE) +unset(CUDA_CUDART CACHE) +unset(CUDA_OpenCL_LIBRARY CACHE) +unset(CUDA_cublasLt_LIBRARY CACHE) +unset(CUDA_cublasLt_static_LIBRARY CACHE) +unset(CUDA_cublas_LIBRARY CACHE) +unset(CUDA_cublas_static_LIBRARY CACHE) +unset(CUDA_cuda_driver_LIBRARY CACHE) +unset(CUDA_cudart_LIBRARY CACHE) +unset(CUDA_cudart_static_LIBRARY CACHE) +unset(CUDA_cufft_LIBRARY CACHE) +unset(CUDA_cufft_static_LIBRARY CACHE) +unset(CUDA_cufft_static_nocallback_LIBRARY CACHE) +unset(CUDA_cufftw_LIBRARY CACHE) +unset(CUDA_cufftw_static_LIBRARY CACHE) +unset(CUDA_culibos_LIBRARY CACHE) +unset(CUDA_cupti_LIBRARY CACHE) +unset(CUDA_cupti_static_LIBRARY CACHE) +unset(CUDA_curand_LIBRARY CACHE) +unset(CUDA_curand_static_LIBRARY CACHE) +unset(CUDA_cusolver_LIBRARY CACHE) +unset(CUDA_cusolver_lapack_static_LIBRARY CACHE) +unset(CUDA_cusolver_static_LIBRARY CACHE) +unset(CUDA_cusparse_LIBRARY CACHE) +unset(CUDA_cusparse_static_LIBRARY CACHE) +unset(CUDA_nppc_LIBRARY CACHE) +unset(CUDA_nppc_static_LIBRARY CACHE) +unset(CUDA_nppial_LIBRARY CACHE) +unset(CUDA_nppial_static_LIBRARY CACHE) +unset(CUDA_nppicc_LIBRARY CACHE) +unset(CUDA_nppicc_static_LIBRARY CACHE) +unset(CUDA_nppicom_LIBRARY CACHE) +unset(CUDA_nppicom_static_LIBRARY CACHE) +unset(CUDA_nppidei_LIBRARY CACHE) +unset(CUDA_nppidei_static_LIBRARY CACHE) +unset(CUDA_nppif_LIBRARY CACHE) +unset(CUDA_nppif_static_LIBRARY CACHE) +unset(CUDA_nppig_LIBRARY CACHE) +unset(CUDA_nppig_static_LIBRARY CACHE) +unset(CUDA_nppim_LIBRARY CACHE) +unset(CUDA_nppim_static_LIBRARY CACHE) +unset(CUDA_nppist_LIBRARY CACHE) +unset(CUDA_nppist_static_LIBRARY CACHE) +unset(CUDA_nppisu_LIBRARY CACHE) +unset(CUDA_nppisu_static_LIBRARY CACHE) +unset(CUDA_nppitc_LIBRARY CACHE) +unset(CUDA_nppitc_static_LIBRARY CACHE) +unset(CUDA_npps_LIBRARY CACHE) +unset(CUDA_npps_static_LIBRARY CACHE) +unset(CUDA_nvToolsExt_LIBRARY CACHE) +unset(CUDA_nvgraph_LIBRARY CACHE) +unset(CUDA_nvgraph_static_LIBRARY CACHE) +unset(CUDA_nvjpeg_LIBRARY CACHE) +unset(CUDA_nvjpeg_static_LIBRARY CACHE) +unset(CUDA_nvml_LIBRARY CACHE) +unset(CUDA_nvrtc_LIBRARY CACHE) \ No newline at end of file diff --git 
a/libcuda/CMakeLists.txt b/libcuda/CMakeLists.txt
new file mode 100644
index 000000000..c4ba4d181
--- /dev/null
+++ b/libcuda/CMakeLists.txt
@@ -0,0 +1,26 @@
+# Specify Flex and Bison target
+BISON_TARGET(cuobjdump_parser cuobjdump.y ${CMAKE_CURRENT_BINARY_DIR}/cuobjdump_parser.c
+             COMPILE_FLAGS "-t -d -v --report=all -p cuobjdump_ --file-prefix=${CMAKE_CURRENT_BINARY_DIR}/cuobjdump")
+FLEX_TARGET(cuobjdump_lexer cuobjdump.l ${CMAKE_CURRENT_BINARY_DIR}/cuobjdump_lexer.c
+            COMPILE_FLAGS "-B -P cuobjdump_")
+ADD_FLEX_BISON_DEPENDENCY(cuobjdump_lexer cuobjdump_parser)
+
+# Set generated source files to CXX
+set_source_files_properties(${BISON_cuobjdump_parser_OUTPUT_SOURCE}
+                            ${FLEX_cuobjdump_lexer_OUTPUTS}
+                            PROPERTIES LANGUAGE CXX)
+
+# Create libcuda.a with all source files
+add_library(cuda STATIC
+            cuda_runtime_api.cc
+            ${BISON_cuobjdump_parser_OUTPUT_SOURCE} ${FLEX_cuobjdump_lexer_OUTPUTS})
+
+# Add current dir to include path
+# Also add flex/bison generated header files
+target_include_directories(cuda PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
+
+# Add cuda include path for own reference
+target_include_directories(cuda PRIVATE ${CUDAToolkit_INCLUDE_DIRS})
+
+# Add project build dir to include path
+target_include_directories(cuda PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
diff --git a/libcuda/cuda_api_object.h b/libcuda/cuda_api_object.h
index d292e224e..e620e5728 100644
--- a/libcuda/cuda_api_object.h
+++ b/libcuda/cuda_api_object.h
@@ -1,6 +1,7 @@
 #ifndef __cuda_api_object_h__
 #define __cuda_api_object_h__
 
+#include <functional>
 #include <...>
 #include <...>
 #include <...>
@@ -193,9 +194,25 @@ class cuda_runtime_api {
   // backward pointer
   class gpgpu_context *gpgpu_ctx;
   // member function list
+
+  // For SST and other potential simulator interfaces
+  void cuobjdumpInit(const char *fn);
+  void extract_code_using_cuobjdump(const char *fn);
+  void extract_ptx_files_using_cuobjdump(CUctx_st *context, const char *fn);
+
+  // For running GPGPU-Sim alone
   void cuobjdumpInit();
   void extract_code_using_cuobjdump();
   void extract_ptx_files_using_cuobjdump(CUctx_st *context);
+
+  // Internal functions for the above public methods
+  void cuobjdumpInit_internal(std::function<void(void)> ctx_extract_code_func);
+  void extract_code_using_cuobjdump_internal(
+      CUctx_st *context, std::string &app_binary,
+      std::function<void(CUctx_st *)> ctx_extract_ptx_func);
+  void extract_ptx_files_using_cuobjdump_internal(CUctx_st *context,
+                                                  std::string &app_binary);
+
   std::list<cuobjdumpSection *> pruneSectionList(CUctx_st *context);
   std::list<cuobjdumpSection *> mergeMatchingSections(std::string identifier);
   std::list<cuobjdumpSection *> mergeSections();
diff --git a/libcuda/cuda_runtime_api.cc b/libcuda/cuda_runtime_api.cc
index fd05f555c..5dfd3fc38 100644
--- a/libcuda/cuda_runtime_api.cc
+++ b/libcuda/cuda_runtime_api.cc
@@ -109,6 +109,7 @@
 #include <...>
 #include <...>
 #include <...>
+#include <...>
 #include <...>
 #include <...>
 #include <...>
@@ -133,16 +134,16 @@
 #if (CUDART_VERSION < 8000)
 #include "__cudaFatFormat.h"
 #endif
-#include "gpgpu_context.h"
-#include "cuda_api_object.h"
-#include "../src/gpgpu-sim/gpu-sim.h"
-#include "../src/cuda-sim/ptx_loader.h"
+#include "../src/abstract_hardware_model.h"
 #include "../src/cuda-sim/cuda-sim.h"
 #include "../src/cuda-sim/ptx_ir.h"
+#include "../src/cuda-sim/ptx_loader.h"
 #include "../src/cuda-sim/ptx_parser.h"
+#include "../src/gpgpu-sim/gpu-sim.h"
 #include "../src/gpgpusim_entrypoint.h"
 #include "../src/stream_manager.h"
-#include "../src/abstract_hardware_model.h"
+#include "cuda_api_object.h"
+#include "gpgpu_context.h"
 
 #include <...>
 #include <...>
@@ -151,6 +152,9 @@
 #include <...>
 #endif
 
+// SST cycle
+extern bool
SST_Cycle(); + /*DEVICE_BUILTIN*/ struct cudaArray { void *devPtr; @@ -412,6 +416,13 @@ void setCuobjdumpsassfilename( //! processes (e.g. cuobjdump) reading /proc//exe will see the emulator //! executable instead of the application binary. //! +// In SST need the string to pass the binary information +// as we cannot get it from /proc/self/exe +std::string get_app_binary(const char *fn) { + printf("self exe links to: %s\n", fn); + return fn; +} + std::string get_app_binary() { char self_exe_path[1025]; #ifdef __APPLE__ @@ -435,7 +446,7 @@ std::string get_app_binary() { // above func gives abs path whereas this give just the name of application. char *get_app_binary_name(std::string abs_path) { - char *self_exe_path; + char *self_exe_path = NULL; #ifdef __APPLE__ // TODO: get apple device and check the result. printf("WARNING: not tested for Apple-mac devices \n"); @@ -453,17 +464,27 @@ char *get_app_binary_name(std::string abs_path) { return self_exe_path; } -static int get_app_cuda_version() { +static int get_app_cuda_version_internal(std::string app_binary) { int app_cuda_version = 0; char fname[1024]; snprintf(fname, 1024, "_app_cuda_version_XXXXXX"); int fd = mkstemp(fname); close(fd); + // Weili: Add way to extract CUDA version information from Balar Vanadis + // binary (stored as a const string) std::string app_cuda_version_command = - "ldd " + get_app_binary() + + "ldd " + app_binary + " | grep libcudart.so | sed 's/.*libcudart.so.\\(.*\\) =>.*/\\1/' > " + + fname + " && strings " + app_binary + + " | grep libcudart_vanadis.a | sed " + "'s/.*libcudart_vanadis.a.\\(.*\\)/\\1/' >> " + fname; - system(app_cuda_version_command.c_str()); + int res = system(app_cuda_version_command.c_str()); + if (res == -1) { + printf("Error - Cannot detect the app's CUDA version. Command: %s\n", + app_cuda_version_command.c_str()); + exit(1); + } FILE *cmd = fopen(fname, "r"); char buf[256]; while (fgets(buf, sizeof(buf), cmd) != 0) { @@ -472,12 +493,24 @@ static int get_app_cuda_version() { } fclose(cmd); if (app_cuda_version == 0) { - printf("Error - Cannot detect the app's CUDA version.\n"); + printf("Error - Cannot detect the app's CUDA version. Command: %s\n", + app_cuda_version_command.c_str()); exit(1); } return app_cuda_version; } +static int get_app_cuda_version(const char *fn) { + // Use for other simulator integration + std::string app_binary = get_app_binary(fn); + return get_app_cuda_version_internal(app_binary); +} + +static int get_app_cuda_version() { + std::string app_binary = get_app_binary(); + return get_app_cuda_version_internal(app_binary); +} + //! Keep track of the association between filename and cubin handle void cuda_runtime_api::cuobjdumpRegisterFatBinary(unsigned int handle, const char *filename, @@ -570,8 +603,11 @@ __host__ cudaError_t CUDARTAPI cudaDeviceGetLimitInternal( return g_last_cudaError = cudaSuccess; } -void **cudaRegisterFatBinaryInternal(void *fatCubin, - gpgpu_context *gpgpu_ctx = NULL) { +// Internal implementation for cudaRegisterFatBiaryInternal +void **cudaRegisterFatBiaryInternal_impl( + void *fatCubin, gpgpu_context *gpgpu_ctx, std::string &app_binary_path, + int app_cuda_version, + std::function ctx_cuobjdumpInit_func) { gpgpu_context *ctx; if (gpgpu_ctx) { ctx = gpgpu_ctx; @@ -602,11 +638,9 @@ void **cudaRegisterFatBinaryInternal(void *fatCubin, // compiled with a newer version of CUDA to run apps compiled with older // versions of CUDA. This is especially useful for PTXPLUS execution. 
// Skip cuda version check for pytorch application - std::string app_binary_path = get_app_binary(); int pos = app_binary_path.find("python"); if (pos == std::string::npos) { // Not pytorch app : checking cuda version - int app_cuda_version = get_app_cuda_version(); assert( app_cuda_version == CUDART_VERSION / 1000 && "The app must be compiled with same major version as the simulator."); @@ -657,7 +691,7 @@ void **cudaRegisterFatBinaryInternal(void *fatCubin, * then for next calls, only returns the appropriate number */ assert(fat_cubin_handle >= 1); - if (fat_cubin_handle == 1) ctx->api->cuobjdumpInit(); + if (fat_cubin_handle == 1) ctx_cuobjdumpInit_func(ctx); ctx->api->cuobjdumpRegisterFatBinary(fat_cubin_handle, filename, context); return (void **)fat_cubin_handle; @@ -749,6 +783,28 @@ void **cudaRegisterFatBinaryInternal(void *fatCubin, #endif } +void **cudaRegisterFatBinaryInternal(const char *fn, void *fatCubin, + gpgpu_context *gpgpu_ctx = NULL) { + std::string app_binary_path = get_app_binary(fn); + int app_cuda_version = get_app_cuda_version(fn); + auto ctx_cuobjdumpInit = [=](gpgpu_context *ctx) { + ctx->api->cuobjdumpInit(fn); + }; + return cudaRegisterFatBiaryInternal_impl(fatCubin, gpgpu_ctx, app_binary_path, + app_cuda_version, ctx_cuobjdumpInit); +} + +void **cudaRegisterFatBinaryInternal(void *fatCubin, + gpgpu_context *gpgpu_ctx = NULL) { + std::string app_binary_path = get_app_binary(); + int app_cuda_version = get_app_cuda_version(); + auto ctx_cuobjdumpInit = [](gpgpu_context *ctx) { + ctx->api->cuobjdumpInit(); + }; + return cudaRegisterFatBiaryInternal_impl(fatCubin, gpgpu_ctx, app_binary_path, + app_cuda_version, ctx_cuobjdumpInit); +} + void cudaRegisterFunctionInternal(void **fatCubinHandle, const char *hostFun, char *deviceFun, const char *deviceName, int thread_limit, uint3 *tid, uint3 *bid, @@ -1053,6 +1109,24 @@ cudaError_t cudaMallocHostInternal(void **ptr, size_t size, } } +// SST malloc done by vanadis, we just need to record the memory addr +cudaError_t CUDARTAPI cudaMallocHostSSTInternal( + void *addr, size_t size, gpgpu_context *gpgpu_ctx = NULL) { + gpgpu_context *ctx; + if (gpgpu_ctx) { + ctx = gpgpu_ctx; + } else { + ctx = GPGPU_Context(); + } + if (g_debug_execution >= 3) { + announce_call(__my_func__); + } + // track pinned memory size allocated in the host so that same amount of + // memory is also allocated in GPU. 
+  ctx->api->pinned_memory_size[addr] = size;
+  return g_last_cudaError = cudaSuccess;
+}
+
 __host__ cudaError_t CUDARTAPI cudaMallocPitchInternal(void **devPtr, size_t *pitch,
                                                        size_t width, size_t height,
                                                        gpgpu_context *gpgpu_ctx = NULL) {
@@ -1410,14 +1484,16 @@ cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlagsInternal(
   function_info *entry = context->get_kernel(hostFunc);
   printf(
       "Calculate Maxium Active Block with function ptr=%p, blockSize=%d, "
-      "SMemSize=%d\n",
+      "SMemSize=%lu\n",
       hostFunc, blockSize, dynamicSMemSize);
   if (flags == cudaOccupancyDefault) {
     // create kernel_info based on entry
     dim3 gridDim(context->get_device()->get_gpgpu()->max_cta_per_core() *
                  context->get_device()->get_gpgpu()->get_config().num_shader());
     dim3 blockDim(blockSize);
-    kernel_info_t result(gridDim, blockDim, entry);
+    // because this function is only checking for resource requirements, we do
+    // not care which stream this kernel runs in, so we just pick -1
+    kernel_info_t result(gridDim, blockDim, entry, -1);
     // if(entry == NULL){
     //   *numBlocks = 1;
     //   return g_last_cudaError = cudaErrorUnknown;
     // }
@@ -2295,13 +2371,77 @@ cudaDeviceSynchronizeInternal(gpgpu_context *gpgpu_ctx = NULL) {
  *                                                                             *
  *                                                                             *
  *******************************************************************************/
-extern "C" {
-
 /*******************************************************************************
  *                                                                             *
- *                                                                             *
+ *                   SST Specific functions, used by Balar                     *
  *                                                                             *
  *******************************************************************************/
+
+/**
+ * @brief Custom function to get CUDA function parameter size and offset
+ *        from PTX parsing result
+ *
+ * @param hostFun
+ * @param index
+ * @return std::tuple<cudaError_t, size_t, unsigned>
+ */
+std::tuple<cudaError_t, size_t, unsigned> SST_cudaGetParamConfig(
+    uint64_t hostFun, unsigned index) {
+  if (g_debug_execution >= 3) {
+    announce_call(__my_func__);
+  }
+  CUctx_st *context = GPGPUSim_Context(GPGPU_Context());
+  function_info *entry = context->get_kernel((char *)hostFun);
+  cudaError_t result = cudaSuccess;
+  size_t size = 0;
+  unsigned alignment = 0;
+  if (index >= entry->num_args()) {
+    result = cudaErrorAssert;
+  } else {
+    std::pair<size_t, unsigned> p = entry->get_param_config(index);
+    size = p.first;
+    alignment = p.second;
+  }
+  return std::tuple<cudaError_t, size_t, unsigned>(result, size, alignment);
+}
+
+extern "C" {
+void SST_receive_mem_reply(unsigned core_id, void *mem_req) {
+  CUctx_st *context = GPGPUSim_Context(GPGPU_Context());
+  static_cast(context->get_device()->get_gpgpu())
+      ->SST_receive_mem_reply(core_id, mem_req);
+  // printf("GPGPU-sim: Received Request\n");
+}
+
+bool SST_gpu_core_cycle() { return SST_Cycle(); }
+
+void SST_gpgpusim_numcores_equal_check(unsigned sst_numcores) {
+  CUctx_st *context = GPGPUSim_Context(GPGPU_Context());
+  static_cast(context->get_device()->get_gpgpu())
+      ->SST_gpgpusim_numcores_equal_check(sst_numcores);
+}
+
+uint64_t cudaMallocSST(void **devPtr, size_t size) {
+  if (g_debug_execution >= 3) {
+    announce_call(__my_func__);
+  }
+  void *test_malloc;
+  test_malloc = (void *)malloc(size);
+  void **test_malloc2 = &test_malloc;
+  CUctx_st *context = GPGPUSim_Context(GPGPU_Context());
+  *test_malloc2 = context->get_device()->get_gpgpu()->gpu_malloc(size);
+  printf("GPGPU-Sim PTX: cudaMallocing %zu bytes starting at 0x%llx..\n", size,
+         (unsigned long long)*test_malloc2);
+  if (g_debug_execution >= 3)
+    printf("GPGPU-Sim PTX: cudaMallocing %zu bytes starting at 0x%llx..\n",
+           size, (unsigned long long)*test_malloc2);
+  return (uint64_t)*test_malloc2;
+}
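For orientation, a rough sketch of how an external co-simulator such as SST's Balar component might drive these entry points. Only the extern "C" declarations mirror the patch (CUDARTAPI elided); the call order, paths, kernel name, and handle values are hypothetical:

#include <cstddef>
#include <cstdint>
#include <cstdio>

extern "C" {
void **__cudaRegisterFatBinarySST(const char *fn);
void __cudaRegisterFunctionSST(unsigned fatCubinHandle, uint64_t hostFun,
                               char deviceFun[512]);
uint64_t cudaMallocSST(void **devPtr, size_t size);
bool SST_gpu_core_cycle();
}

void balar_like_driver() {
  // Register the guest binary by path (SST cannot read /proc/self/exe).
  void **handle = __cudaRegisterFatBinarySST("/tmp/vanadis_app");  // example path

  // Register one kernel under an opaque host-side key.
  char name[512] = "_Z6kernelPf";  // example mangled name
  uint64_t hostFun = 0x1000;       // example key
  __cudaRegisterFunctionSST((unsigned)(uintptr_t)handle, hostFun, name);

  // Allocate simulated device memory; the address comes back as a uint64_t.
  void *devPtr = nullptr;
  uint64_t addr = cudaMallocSST(&devPtr, 4096);
  std::printf("simulated device buffer at 0x%llx\n", (unsigned long long)addr);

  // ... cudaSetupArgumentSST(...) and cudaLaunchSST(hostFun) would go here ...

  // Tick the GPU model from SST's clock handler until it reports done
  // (the exact meaning of the return value follows SST_Cycle's contract).
  while (SST_gpu_core_cycle()) {
  }
}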
+
+__host__ cudaError_t CUDARTAPI cudaMallocHostSST(void *addr, size_t size) {
+  return cudaMallocHostSSTInternal(addr, size);
+}
+
 cudaError_t cudaPeekAtLastError(void) { return g_last_cudaError; }
 
 __host__ cudaError_t CUDARTAPI cudaMalloc(void **devPtr, size_t size) {
@@ -2528,6 +2668,7 @@ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
  *                                                                             *
  *                                                                             *
  *******************************************************************************/
+
 __host__ cudaError_t CUDARTAPI cudaMemset(void *mem, int c, size_t count) {
   return cudaMemsetInternal(mem, c, count);
 }
@@ -2748,11 +2889,32 @@ __host__ const char *CUDARTAPI cudaGetErrorString(cudaError_t error) {
   return strdup(buf);
 }
 
+// SST specific cuda apis
+__host__ cudaError_t CUDARTAPI cudaSetupArgumentSST(uint64_t arg,
+                                                    uint8_t value[200],
+                                                    size_t size,
+                                                    size_t offset) {
+  void *local_value;
+  local_value = (void *)malloc(size);
+
+  if (arg) {
+    memcpy(local_value, (void *)&arg, size);
+  } else {
+    memcpy(local_value, value, size);
+  }
+  return cudaSetupArgumentInternal(local_value, size, offset);
+}
+
 __host__ cudaError_t CUDARTAPI cudaSetupArgument(const void *arg, size_t size,
                                                  size_t offset) {
   return cudaSetupArgumentInternal(arg, size, offset);
 }
 
+// SST specific cuda apis
+__host__ cudaError_t CUDARTAPI cudaLaunchSST(uint64_t hostFun) {
+  return cudaLaunchInternal((char *)hostFun);
+}
+
 __host__ cudaError_t CUDARTAPI cudaLaunch(const char *hostFun) {
   return cudaLaunchInternal(hostFun);
 }
@@ -2927,6 +3089,27 @@ __host__ cudaError_t CUDARTAPI cudaThreadSynchronize(void) {
   return cudaThreadSynchronizeInternal();
 }
 
+__host__ cudaError_t CUDARTAPI cudaThreadSynchronizeSST(void) {
+  // For SST, perform a one-time check and let SST_Cycle()
+  // do the polling test and invoke the callback to SST
+  // to signal that ThreadSynchronize is done
+  gpgpu_context *ctx = GPGPU_Context();
+  if (g_debug_execution >= 3) {
+    announce_call(__my_func__);
+  }
+
+  // Called on host side
+  bool thread_sync_done = ctx->synchronize_check();
+  g_last_cudaError = cudaSuccess;
+  if (thread_sync_done) {
+    // We are already done, so no need to poll for sync done
+    ctx->requested_synchronize = false;
+    return cudaSuccess;
+  } else {
+    return cudaErrorNotReady;
+  }
+}
+
 int CUDARTAPI __cudaSynchronizeThreads(void **, void *) {
   if (g_debug_execution >= 3) {
     announce_call(__my_func__);
@@ -2986,10 +3169,10 @@ __host__ cudaError_t CUDARTAPI cudaGetExportTable(
 
 // extracts all ptx files from binary and dumps into
 // prog_name.unique_no.sm_<>.ptx files
-void cuda_runtime_api::extract_ptx_files_using_cuobjdump(CUctx_st *context) {
+void cuda_runtime_api::extract_ptx_files_using_cuobjdump_internal(
+    CUctx_st *context, std::string &app_binary) {
   char command[1000];
   char *pytorch_bin = getenv("PYTORCH_BIN");
-  std::string app_binary = get_app_binary();
 
   char ptx_list_file_name[1024];
   snprintf(ptx_list_file_name, 1024, "_cuobjdump_list_ptx_XXXXXX");
@@ -3056,6 +3239,17 @@
   }
 }
 
+void cuda_runtime_api::extract_ptx_files_using_cuobjdump(CUctx_st *context,
+                                                         const char *fn) {
+  std::string app_binary = get_app_binary(fn);
+  this->extract_ptx_files_using_cuobjdump_internal(context, app_binary);
+}
+
+void cuda_runtime_api::extract_ptx_files_using_cuobjdump(CUctx_st *context) {
+  std::string app_binary = get_app_binary();
+  this->extract_ptx_files_using_cuobjdump_internal(context, app_binary);
+}
+
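The refactoring pattern here repeats across cuobjdumpInit, extract_code_using_cuobjdump, and extract_ptx_files_using_cuobjdump: keep the old public method, move the body into an _internal method, and pass the varying piece (binary path, follow-up extraction step) in as a parameter or std::function. A stripped-down sketch of the shape, with illustrative names rather than the patch's code:

#include <functional>
#include <iostream>
#include <string>

// Shared implementation: everything that used to read /proc/self/exe now
// receives the binary path (and any follow-up step) from its caller.
static void extract_internal(const std::string &app_binary,
                             std::function<void()> next_step) {
  std::cout << "extracting from " << app_binary << "\n";
  next_step();
}

// Standalone GPGPU-Sim path: resolve the binary from the running process.
void extract() {
  extract_internal("/proc/self/exe", [] { std::cout << "standalone ptx step\n"; });
}

// SST path: the co-simulator hands us the guest binary's filename.
void extract(const char *fn) {
  extract_internal(fn, [=] { std::cout << "SST ptx step for " << fn << "\n"; });
}

int main() {
  extract();
  extract("/tmp/vanadis_app");  // hypothetical guest binary
  return 0;
}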
* This Function extract the whole PTX (for all the files) using cuobjdump @@ -3063,13 +3257,12 @@ void cuda_runtime_api::extract_ptx_files_using_cuobjdump(CUctx_st *context) { *with each binary in its own file It is also responsible for extracting the *libraries linked to the binary if the option is enabled * */ -void cuda_runtime_api::extract_code_using_cuobjdump() { - CUctx_st *context = GPGPUSim_Context(gpgpu_ctx); - +void cuda_runtime_api::extract_code_using_cuobjdump_internal( + CUctx_st *context, std::string &app_binary, + std::function<void(CUctx_st *)> ctx_extract_ptx_func) { // prevent the dumping by cuobjdump everytime we execute the code! const char *override_cuobjdump = getenv("CUOBJDUMP_SIM_FILE"); char command[1000]; - std::string app_binary = get_app_binary(); // Running cuobjdump using dynamic link to current process snprintf(command, 1000, "md5sum %s ", app_binary.c_str()); printf("Running md5sum using \"%s\"\n", command); @@ -3084,7 +3277,7 @@ void cuda_runtime_api::extract_code_using_cuobjdump() { // used by ptxas. int result = 0; #if (CUDART_VERSION >= 6000) - extract_ptx_files_using_cuobjdump(context); + ctx_extract_ptx_func(context); return; #endif // TODO: redundant to dump twice. how can it be prevented? @@ -3216,6 +3409,26 @@ void cuda_runtime_api::extract_code_using_cuobjdump() { } } +void cuda_runtime_api::extract_code_using_cuobjdump(const char *fn) { + CUctx_st *context = GPGPUSim_Context(gpgpu_ctx); + std::string app_binary = get_app_binary(fn); + auto ctx_extract_ptx_func = [=](CUctx_st *context) { + extract_ptx_files_using_cuobjdump(context, fn); + }; + extract_code_using_cuobjdump_internal(context, app_binary, + ctx_extract_ptx_func); +} + +void cuda_runtime_api::extract_code_using_cuobjdump() { + CUctx_st *context = GPGPUSim_Context(gpgpu_ctx); + std::string app_binary = get_app_binary(); + auto ctx_extract_ptx_func = [=](CUctx_st *context) { + extract_ptx_files_using_cuobjdump(context); + }; + extract_code_using_cuobjdump_internal(context, app_binary, + ctx_extract_ptx_func); +} + //! Read file into char* // TODO: convert this to C++ streams, will be way cleaner char *readfile(const std::string filename) { @@ -3234,7 +3447,12 @@ char *readfile(const std::string filename) { fseek(fp, 0, SEEK_SET); // allocate and copy the entire ptx char *ret = (char *)malloc((filesize + 1) * sizeof(char)); - fread(ret, 1, filesize, fp); + int num = fread(ret, 1, filesize, fp); + if (num == 0) { + std::cout << "ERROR: Could not read data from file " + << filename << std::endl; + assert(0); + } ret[filesize] = '\0'; fclose(fp); return ret; @@ -3455,10 +3673,11 @@ cuobjdumpPTXSection *cuda_runtime_api::findPTXSection( } //!
Extract the code using cuobjdump and remove unnecessary sections -void cuda_runtime_api::cuobjdumpInit() { +void cuda_runtime_api::cuobjdumpInit_internal( + std::function<void(void)> ctx_extract_code_func) { CUctx_st *context = GPGPUSim_Context(gpgpu_ctx); - extract_code_using_cuobjdump(); // extract all the output of cuobjdump to - // _cuobjdump_*.* + ctx_extract_code_func(); // extract all the output of cuobjdump to + // _cuobjdump_*.* const char *pre_load = getenv("CUOBJDUMP_SIM_FILE"); if (pre_load == NULL || strlen(pre_load) == 0) { cuobjdumpSectionList = pruneSectionList(context); @@ -3466,6 +3685,16 @@ void cuda_runtime_api::cuobjdumpInit() { } } +void cuda_runtime_api::cuobjdumpInit(const char *fn) { + auto ctx_extract_code_func = [=]() { extract_code_using_cuobjdump(fn); }; + cuobjdumpInit_internal(ctx_extract_code_func); +} + +void cuda_runtime_api::cuobjdumpInit() { + auto ctx_extract_code_func = [=]() { extract_code_using_cuobjdump(); }; + cuobjdumpInit_internal(ctx_extract_code_func); +} + //! Either submit PTX for simulation or convert SASS to PTXPlus and submit it void gpgpu_context::cuobjdumpParseBinary(unsigned int handle) { CUctx_st *context = GPGPUSim_Context(this); @@ -3478,7 +3707,7 @@ void gpgpu_context::cuobjdumpParseBinary(unsigned int handle) { context->add_binary(symtab, handle); return; } - symbol_table *symtab; + symbol_table *symtab = NULL; #if (CUDART_VERSION >= 6000) // loops through all ptx files from smallest sm version to largest @@ -3576,6 +3805,10 @@ void gpgpu_context::cuobjdumpParseBinary(unsigned int handle) { extern "C" { +void **CUDARTAPI __cudaRegisterFatBinarySST(const char *fn) { + return cudaRegisterFatBinaryInternal(fn, NULL); +} + void **CUDARTAPI __cudaRegisterFatBinary(void *fatCubin) { if (g_debug_execution >= 3) { announce_call(__my_func__); @@ -3596,6 +3829,7 @@ unsigned CUDARTAPI __cudaPushCallConfiguration(dim3 gridDim, dim3 blockDim, announce_call(__my_func__); } cudaConfigureCallInternal(gridDim, blockDim, sharedMem, stream); + return 0; } cudaError_t CUDARTAPI __cudaPopCallConfiguration(dim3 *gridDim, dim3 *blockDim, @@ -3607,6 +3841,14 @@ cudaError_t CUDARTAPI __cudaPopCallConfiguration(dim3 *gridDim, dim3 *blockDim, return g_last_cudaError = cudaSuccess; } +void CUDARTAPI __cudaRegisterFunctionSST(unsigned fatCubinHandle, + uint64_t hostFun, + char deviceFun[512]) { + cudaRegisterFunctionInternal((void **)fatCubinHandle, (const char *)hostFun, + (char *)deviceFun, NULL, NULL, NULL, NULL, NULL, + NULL); +} + void CUDARTAPI __cudaRegisterFunction(void **fatCubinHandle, const char *hostFun, char *deviceFun, const char *deviceName, int thread_limit, diff --git a/libcuda/gpgpu_context.h b/libcuda/gpgpu_context.h index d0cd7c48c..5ef21177b 100644 --- a/libcuda/gpgpu_context.h +++ b/libcuda/gpgpu_context.h @@ -44,6 +44,9 @@ class gpgpu_context { s_g_pc_to_insn; // a direct mapping from PC to instruction bool debug_tensorcore; + // SST related + bool requested_synchronize = false; + // objects pointers for each file cuda_runtime_api *api; ptxinfo_data *ptxinfo; @@ -54,6 +57,7 @@ class gpgpu_context { ptx_stats *stats; // member function list void synchronize(); + bool synchronize_check(); void exit_simulation(); void print_simulation_time(); int gpgpu_opencl_ptx_sim_main_perf(kernel_info_t *grid); diff --git a/libopencl/CMakeLists.txt b/libopencl/CMakeLists.txt new file mode 100644 index 000000000..e69de29bb diff --git a/setup_environment b/setup_environment index 07d078844..2fac1b991 100644 --- a/setup_environment +++ b/setup_environment @@ -7,7 +7
@@ export GPGPUSIM_ROOT="$( cd "$( dirname "$BASH_SOURCE" )" && pwd )" GPGPUSIM_VERSION_STRING=`cat $GPGPUSIM_ROOT/version | awk '/Version/ {print $8}'` #Detect Git branch and commit # -GIT_COMMIT=`git --git-dir=$GPGPUSIM_ROOT/.git log -n 1 | head -1 | sed -re 's/commit (.*)/\1/'` +GIT_COMMIT=`git --git-dir=$GPGPUSIM_ROOT/.git log --abbrev-commit -n 1 | head -1 | sed -re 's/commit (.*)/\1/'` GIT_FILES_CHANGED=`git --git-dir=$GPGPUSIM_ROOT/.git diff --numstat | wc | sed -re 's/^\s+([0-9]+).*/\1./'` GIT_FILES_CHANGED+=`git --git-dir=$GPGPUSIM_ROOT/.git diff --numstat --cached | wc | sed -re 's/^\s+([0-9]+).*/\1/'` GPGPUSIM_BUILD_STRING="gpgpu-sim_git-commit-$GIT_COMMIT-modified_$GIT_FILES_CHANGED" @@ -16,17 +16,17 @@ echo -n "GPGPU-Sim version $GPGPUSIM_VERSION_STRING (build $GPGPUSIM_BUILD_STRIN if [ ! -n "$CUDA_INSTALL_PATH" ]; then echo "ERROR ** Install CUDA Toolkit and set CUDA_INSTALL_PATH."; - return; + return 1; fi if [ ! -d "$CUDA_INSTALL_PATH" ]; then echo "ERROR ** CUDA_INSTALL_PATH=$CUDA_INSTALL_PATH invalid (directory does not exist)"; - return; + return 1; fi if [ ! `uname` = "Linux" -a ! `uname` = "Darwin" ]; then echo "ERROR ** Unsupported platform: GPGPU-Sim $GPGPUSIM_VERSION_STRING developed and tested on Linux." - return; + return 1; fi export PATH=`echo $PATH | sed "s#$GPGPUSIM_ROOT/bin:$CUDA_INSTALL_PATH/bin:##"` @@ -41,17 +41,17 @@ if [ $? = 1 ]; then echo " Try adding $CUDA_INSTALL_PATH/bin/ to your PATH environment variable."; echo " Please also be sure to read the README file if you have not done so."; echo ""; - return; + return 1; fi -CC_VERSION=`gcc --version | head -1 | awk '{for(i=1;i<=NF;i++){ if(match($i,/^[0-9]\.[0-9]\.[0-9]$/)) {print $i; exit 0}}}'` +CC_VERSION=$(gcc --version | head -1 | awk '{for(i=1;i<=NF;i++){ if(match($i,/^[0-9]+\.[0-9]+\.[0-9]+$/)) {print $i; exit 0}}}') CUDA_VERSION_STRING=`$CUDA_INSTALL_PATH/bin/nvcc --version | awk '/release/ {print $5;}' | sed 's/,//'`; export CUDA_VERSION_NUMBER=`echo $CUDA_VERSION_STRING | sed 's/\./ /' | awk '{printf("%02u%02u", 10*int($1), 10*$2);}'` if [ $CUDA_VERSION_NUMBER -gt 11100 -o $CUDA_VERSION_NUMBER -lt 2030 ]; then echo "ERROR ** GPGPU-Sim version $GPGPUSIM_VERSION_STRING not tested with CUDA version $CUDA_VERSION_STRING (please see README)"; echo $CUDA_VERSION_NUMBER - return + return 1; fi if [ $CUDA_VERSION_NUMBER -ge 6000 ]; then @@ -60,13 +60,18 @@ if [ $CUDA_VERSION_NUMBER -ge 6000 ]; then export CUOBJDUMP_SIM_FILE=jj fi +# Simple configure, loop through all positional arguments +# Default config +export GPGPUSIM_CONFIG=gcc-$CC_VERSION/cuda-$CUDA_VERSION_NUMBER/release -if [ $# = '1' ] ; -then - export GPGPUSIM_CONFIG=gcc-$CC_VERSION/cuda-$CUDA_VERSION_NUMBER/$1 -else - export GPGPUSIM_CONFIG=gcc-$CC_VERSION/cuda-$CUDA_VERSION_NUMBER/release -fi +for opt in $@ +do + if [[ $opt == 'debug' ]] ; then + # Debug mode + echo -n "enabled debug mode " + export GPGPUSIM_CONFIG=gcc-$CC_VERSION/cuda-$CUDA_VERSION_NUMBER/$opt + fi +done export QTINC=/usr/include @@ -117,25 +122,25 @@ fi # The following checks to see if the GPGPU-Sim power model is enabled. # GPGPUSIM_POWER_MODEL points to the directory where gpgpusim_mcpat is located. -# If this is not set, it checks the default directory "$GPGPUSIM_ROOT/src/gpuwattch/". -if [ -d $GPGPUSIM_ROOT/src/gpuwattch/ ]; then - if [ ! -f $GPGPUSIM_ROOT/src/gpuwattch/gpgpu_sim.verify ]; then - echo "ERROR ** gpgpu_sim.verify not found in $GPGPUSIM_ROOT/src/gpuwattch"; - return; +# If this is not set, it checks the default directory "$GPGPUSIM_ROOT/src/accelwattch/".
+if [ -d $GPGPUSIM_ROOT/src/accelwattch/ ]; then + if [ ! -f $GPGPUSIM_ROOT/src/accelwattch/gpgpu_sim.verify ]; then + echo "ERROR ** gpgpu_sim.verify not found in $GPGPUSIM_ROOT/src/accelwattch"; + return 1; fi - export GPGPUSIM_POWER_MODEL=$GPGPUSIM_ROOT/src/gpuwattch/; - echo "configured with GPUWattch."; + export GPGPUSIM_POWER_MODEL=$GPGPUSIM_ROOT/src/accelwattch/; + echo "configured with AccelWattch."; elif [ -n "$GPGPUSIM_POWER_MODEL" ]; then if [ ! -f $GPGPUSIM_POWER_MODEL/gpgpu_sim.verify ]; then echo ""; - echo "ERROR ** gpgpu_sim.verify not found in $GPGPUSIM_ROOT/src/gpuwattch/ - Either incorrect directory or incorrect McPAT version"; - return; + echo "ERROR ** gpgpu_sim.verify not found in $GPGPUSIM_ROOT/src/accelwattch/ - Either incorrect directory or incorrect McPAT version"; + return 1; fi echo "configure with power model in $GPGPUSIM_POWER_MODEL."; elif [ ! -d $GPGPUSIM_POWER_MODEL ]; then echo ""; echo "ERROR ** GPGPUSIM_POWER_MODEL ($GPGPUSIM_POWER_MODEL) does not exist... Please set this to the gpgpusim_mcpat directory or unset this environment variable."; - return; + return 1; else echo "configured without a power model."; fi @@ -143,21 +148,26 @@ fi if [ -z "$PTXAS_CUDA_INSTALL_PATH" ]; then export PTXAS_CUDA_INSTALL_PATH=$CUDA_INSTALL_PATH; fi -echo ""; -echo "----------------------------------------------------------------------------"; -echo "INFO - If you only care about PTX execution, ignore this message. GPGPU-Sim supports PTX execution in modern CUDA." -echo "If you want to run PTXPLUS (sm_1x SASS) with a modern card configuration - set the envronment variable" -echo "\$PTXAS_CUDA_INSTALL_PATH to point a CUDA version compabible with your card configurations (i.e. 8+ for PASCAL, 9+ for VOLTA etc..)" -echo "For example: \"export \$PTXAS_CUDA_INSTALL_PATH=/usr/local/cuda-9.1\"" -echo "" -echo "The following text describes why:"; -echo "If you are using PTXPLUS, only sm_1x is supported and it requires that the app and simulator binaries are compiled in CUDA 4.2 or less."; -echo "The simulator requires it since CUDA headers desribe struct sizes in the exec which change from gen to gen."; -echo "The apps require 4.2 because new versions of CUDA tools have dropped parsing support for generating sm_1x" -echo "When running using modern config (i.e. volta) and PTXPLUS with CUDA 4.2, the \$PTXAS_CUDA_INSTALL_PATH env variable is required to get proper register usage" -echo "(and hence occupancy) using a version of CUDA that knows the register usage on the real card." -echo ""; -echo "----------------------------------------------------------------------------"; + +# I am not sure PTXPlus really makes sense anymore and this verbose print to describe +# how to use it is probably not aging well. The info in here is good though if you care +# about PTXPlus, so I will leave it as a comment. +# +#echo ""; +#echo "----------------------------------------------------------------------------"; +#echo "INFO - If you only care about PTX execution or trace-based SASS execution, ignore this message." +#echo "If you want to run PTXPLUS (sm_1x SASS) with a modern card configuration - set the environment variable" +#echo "\$PTXAS_CUDA_INSTALL_PATH to point a CUDA version compatible with your card configurations (i.e.
8+ for PASCAL, 9+ for VOLTA etc..)" +#echo "For example: \"export \$PTXAS_CUDA_INSTALL_PATH=/usr/local/cuda-9.1\"" +#echo "" +#echo "The following text describes why:"; +#echo "If you are using PTXPLUS, only sm_1x is supported and it requires that the app and simulator binaries are compiled in CUDA 4.2 or less."; +#echo "The simulator requires it since CUDA headers describe struct sizes in the exec which change from gen to gen."; +#echo "The apps require 4.2 because new versions of CUDA tools have dropped parsing support for generating sm_1x" +#echo "When running using modern config (i.e. volta) and PTXPLUS with CUDA 4.2, the \$PTXAS_CUDA_INSTALL_PATH env variable is required to get proper register usage" +#echo "(and hence occupancy) using a version of CUDA that knows the register usage on the real card." +#echo ""; +#echo "----------------------------------------------------------------------------"; echo "setup_environment succeeded"; diff --git a/short-tests-cmake.sh b/short-tests-cmake.sh new file mode 100755 index 000000000..e41444156 --- /dev/null +++ b/short-tests-cmake.sh @@ -0,0 +1,27 @@ +if [ ! -n "$CUDA_INSTALL_PATH" ]; then + echo "ERROR ** Install CUDA Toolkit and set CUDA_INSTALL_PATH."; + exit; +fi + +if [ ! -n "$CONFIG" ]; then + echo "ERROR ** set the CONFIG env variable to one of those found in ./accel-sim-framework/util/job_launching/configs/define-standard-cfgs.yml"; + exit; +fi + +if [ ! -n "$GPUAPPS_ROOT" ]; then + echo "ERROR ** set GPUAPPS_ROOT to the location where the apps have been compiled"; + exit; +fi + +git config --system --add safe.directory '*' + +export PATH=$CUDA_INSTALL_PATH/bin:$PATH + +cmake -B build +cmake --build build -j +cmake --install build +source setup + +git clone https://github.com/accel-sim/accel-sim-framework.git +./accel-sim-framework/util/job_launching/run_simulations.py -C $CONFIG -B rodinia_2.0-ft -N regress -l local +./accel-sim-framework/util/job_launching/monitor_func_test.py -v -N regress -j procman diff --git a/travis.sh b/short-tests.sh similarity index 94% rename from travis.sh rename to short-tests.sh index bbdd19acf..44f265a96 100755 --- a/travis.sh +++ b/short-tests.sh @@ -13,12 +13,12 @@ if [ !
-n "$GPUAPPS_ROOT" ]; then exit; fi +git config --system --add safe.directory '*' + export PATH=$CUDA_INSTALL_PATH/bin:$PATH source ./setup_environment make -j -pip install psutil -rm -rf accel-sim-framework git clone https://github.com/accel-sim/accel-sim-framework.git ./accel-sim-framework/util/job_launching/run_simulations.py -C $CONFIG -B rodinia_2.0-ft -N regress -l local ./accel-sim-framework/util/job_launching/monitor_func_test.py -v -N regress -j procman diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt new file mode 100644 index 000000000..5849629e8 --- /dev/null +++ b/src/CMakeLists.txt @@ -0,0 +1,19 @@ +# gpgpusim_entrypoint objects +add_library(gpgpusim_entrypoint OBJECT + abstract_hardware_model.cc + debug.cc + gpgpusim_entrypoint.cc + option_parser.cc + statwrapper.cc + stream_manager.cc + trace.cc) + +# Add current folder and CUDA include to include path +target_include_directories(gpgpusim_entrypoint PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) +target_include_directories(gpgpusim_entrypoint PUBLIC ${CUDAToolkit_INCLUDE_DIRS}) + +# Add subdir +add_subdirectory(accelwattch) +add_subdirectory(cuda-sim) +add_subdirectory(gpgpu-sim) +add_subdirectory(intersim2) \ No newline at end of file diff --git a/src/abstract_hardware_model.cc b/src/abstract_hardware_model.cc index 5ad6f105d..e8ddf95ab 100644 --- a/src/abstract_hardware_model.cc +++ b/src/abstract_hardware_model.cc @@ -1,18 +1,21 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Inderpreet Singh, Timothy Rogers, -// The University of British Columbia -// All rights reserved. +// Copyright (c) 2009-2021, Tor M. Aamodt, Inderpreet Singh, Timothy Rogers, +// Vijay Kandiah, Nikos Hardavellas, Mahmoud Khairy, Junrui Pan, Timothy G. +// Rogers The University of British Columbia, Northwestern University, Purdue +// University All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, +// this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. 
// // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -46,12 +49,14 @@ void mem_access_t::init(gpgpu_context *ctx) { m_addr = 0; m_req_size = 0; } + void warp_inst_t::issue(const active_mask_t &mask, unsigned warp_id, unsigned long long cycle, int dynamic_warp_id, - int sch_id) { + int sch_id, unsigned long long streamID) { m_warp_active_mask = mask; m_warp_issued_mask = mask; m_uid = ++(m_config->gpgpu_ctx->warp_inst_sm_next_uid); + m_streamID = streamID; m_warp_id = warp_id; m_dynamic_warp_id = dynamic_warp_id; issue_cycle = cycle; @@ -72,7 +77,7 @@ void checkpoint::load_global_mem(class memory_space *temp_mem, char *f1name) { FILE *fp2 = fopen(f1name, "r"); assert(fp2 != NULL); char line[128]; /* or other suitable maximum line size */ - unsigned int offset; + unsigned int offset = 0; while (fgets(line, sizeof line, fp2) != NULL) /* read a line */ { unsigned int index; @@ -205,8 +210,8 @@ gpgpu_t::gpgpu_t(const gpgpu_functional_sim_config &config, gpgpu_context *ctx) gpu_tot_sim_cycle = 0; } -address_type line_size_based_tag_func(new_addr_type address, - new_addr_type line_size) { +new_addr_type line_size_based_tag_func(new_addr_type address, + new_addr_type line_size) { // gives the tag for an address based on a given line size return address & ~(line_size - 1); } @@ -288,6 +293,8 @@ void warp_inst_t::generate_mem_accesses() { const size_t starting_queue_size = m_accessq.size(); assert(is_load() || is_store()); + + // if((space.get_type() != tex_space) && (space.get_type() != const_space)) assert(m_per_scalar_thread_valid); // need address information per thread bool is_write = is_store(); @@ -448,7 +455,8 @@ void warp_inst_t::generate_mem_accesses() { for (unsigned thread = 0; thread < m_config->warp_size; thread++) { if (!active(thread)) continue; new_addr_type addr = m_per_scalar_thread[thread].memreqaddr[0]; - unsigned block_address = line_size_based_tag_func(addr, cache_block_size); + new_addr_type block_address = + line_size_based_tag_func(addr, cache_block_size); accesses[block_address].set(thread); unsigned idx = addr - block_address; for (unsigned i = 0; i < data_size; i++) byte_mask.set(idx + i); @@ -530,7 +538,8 @@ void warp_inst_t::memory_coalescing_arch(bool is_write, (m_per_scalar_thread[thread].memreqaddr[access] != 0); access++) { new_addr_type addr = m_per_scalar_thread[thread].memreqaddr[access]; - unsigned block_address = line_size_based_tag_func(addr, segment_size); + new_addr_type block_address = + line_size_based_tag_func(addr, segment_size); unsigned chunk = (addr & 127) / 32; // which 32-byte chunk within in a 128-byte // chunk does this thread access? 
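// A minimal standalone sketch (editorial illustration, not part of the patch):
// the hunks above widen block_address from 'unsigned' to 'new_addr_type'
// (unsigned long long). The masking scheme below mirrors
// line_size_based_tag_func; the example values are hypothetical and only show
// why a 32-bit tag silently collides once addresses cross the 4 GB boundary.
#include <cassert>

typedef unsigned long long new_addr_type;

// Keep only the line-aligned bits; line_size must be a power of two.
static new_addr_type tag_of(new_addr_type address, new_addr_type line_size) {
  return address & ~(line_size - 1);
}

int main() {
  new_addr_type addr = 0x100000040ULL;  // bit 32 set: above 4 GB
  assert(tag_of(addr, 128) == 0x100000000ULL);  // 64-bit tag is correct
  assert((unsigned)tag_of(addr, 128) == 0u);    // truncated tag loses bit 32
  return 0;
}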
@@ -552,7 +561,8 @@ void warp_inst_t::memory_coalescing_arch(bool is_write, if (block_address != line_size_based_tag_func( addr + data_size_coales - 1, segment_size)) { addr = addr + data_size_coales - 1; - unsigned block_address = line_size_based_tag_func(addr, segment_size); + new_addr_type block_address = + line_size_based_tag_func(addr, segment_size); unsigned chunk = (addr & 127) / 32; transaction_info &info = subwarp_transactions[block_address]; info.chunks.set(chunk); @@ -625,7 +635,8 @@ void warp_inst_t::memory_coalescing_arch_atomic(bool is_write, if (!active(thread)) continue; new_addr_type addr = m_per_scalar_thread[thread].memreqaddr[0]; - unsigned block_address = line_size_based_tag_func(addr, segment_size); + new_addr_type block_address = + line_size_based_tag_func(addr, segment_size); unsigned chunk = (addr & 127) / 32; // which 32-byte chunk within in a 128-byte chunk // does this thread access? @@ -746,7 +757,8 @@ void warp_inst_t::completed(unsigned long long cycle) const { } kernel_info_t::kernel_info_t(dim3 gridDim, dim3 blockDim, - class function_info *entry) { + class function_info *entry, + unsigned long long streamID) { m_kernel_entry = entry; m_grid_dim = gridDim; m_block_dim = blockDim; @@ -756,6 +768,7 @@ kernel_info_t::kernel_info_t(dim3 gridDim, dim3 blockDim, m_next_tid = m_next_cta; m_num_cores_running = 0; m_uid = (entry->gpgpu_ctx->kernel_info_m_next_uid)++; + m_streamID = streamID; m_param_mem = new memory_space_impl<8192>("param", 64 * 1024); // Jin: parent and child kernel management for CDP @@ -997,13 +1010,13 @@ void simt_stack::print(FILE *fout) const { } for (unsigned j = 0; j < m_warp_size; j++) fprintf(fout, "%c", (stack_entry.m_active_mask.test(j) ? '1' : '0')); - fprintf(fout, " pc: 0x%03x", stack_entry.m_pc); + fprintf(fout, " pc: 0x%03llx", stack_entry.m_pc); if (stack_entry.m_recvg_pc == (unsigned)-1) { fprintf(fout, " rp: ---- tp: %s cd: %2u ", (stack_entry.m_type == STACK_ENTRY_TYPE_CALL ? "C" : "N"), stack_entry.m_calldepth); } else { - fprintf(fout, " rp: %4u tp: %s cd: %2u ", stack_entry.m_recvg_pc, + fprintf(fout, " rp: %4llu tp: %s cd: %2u ", stack_entry.m_recvg_pc, (stack_entry.m_type == STACK_ENTRY_TYPE_CALL ? "C" : "N"), stack_entry.m_calldepth); } @@ -1023,7 +1036,7 @@ void simt_stack::print_checkpoint(FILE *fout) const { for (unsigned j = 0; j < m_warp_size; j++) fprintf(fout, "%c ", (stack_entry.m_active_mask.test(j) ? '1' : '0')); - fprintf(fout, "%d %d %d %lld %d ", stack_entry.m_pc, + fprintf(fout, "%llu %d %llu %lld %d ", stack_entry.m_pc, stack_entry.m_calldepth, stack_entry.m_recvg_pc, stack_entry.m_branch_div_cycle, stack_entry.m_type); fprintf(fout, "%d %d\n", m_warp_id, m_warp_size); diff --git a/src/abstract_hardware_model.h b/src/abstract_hardware_model.h index 49f3e9f90..98a403997 100644 --- a/src/abstract_hardware_model.h +++ b/src/abstract_hardware_model.h @@ -1,18 +1,21 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Inderpreet Singh, -// The University of British Columbia +// Copyright (c) 2009-2021, Tor M. Aamodt, Inderpreet Singh, Vijay Kandiah, +// Nikos Hardavellas, Mahmoud Khairy, Junrui Pan, Timothy G. Rogers The +// University of British Columbia, Northwestern University, Purdue University // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. 
-// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, +// this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -59,13 +62,37 @@ enum _memory_space_t { instruction_space }; +#ifndef COEFF_STRUCT +#define COEFF_STRUCT + +struct PowerscalingCoefficients { + double int_coeff; + double int_mul_coeff; + double int_mul24_coeff; + double int_mul32_coeff; + double int_div_coeff; + double fp_coeff; + double dp_coeff; + double fp_mul_coeff; + double fp_div_coeff; + double dp_mul_coeff; + double dp_div_coeff; + double sqrt_coeff; + double log_coeff; + double sin_coeff; + double exp_coeff; + double tensor_coeff; + double tex_coeff; +}; +#endif + enum FuncCache { FuncCachePreferNone = 0, FuncCachePreferShared = 1, FuncCachePreferL1 = 2 }; -enum AdaptiveCache { FIXED = 0, ADAPTIVE_VOLTA = 1 }; +enum AdaptiveCache { FIXED = 0, ADAPTIVE_CACHE = 1 }; #ifdef __cplusplus @@ -75,8 +102,8 @@ enum AdaptiveCache { FIXED = 0, ADAPTIVE_CACHE = 1 }; typedef unsigned long long new_addr_type; typedef unsigned long long cudaTextureObject_t; -typedef unsigned address_type; -typedef unsigned addr_t; +typedef unsigned long long address_type; +typedef unsigned long long addr_t; // the following are operations the timing model can see #define SPECIALIZED_UNIT_NUM 8 @@ -134,8 +161,14 @@ enum special_operations_t { FP_SQRT_OP, FP_LG_OP, FP_SIN_OP, - FP_EXP_OP + FP_EXP_OP, + DP_MUL_OP, + DP_DIV_OP, + DP___OP, + TENSOR__OP, + TEX__OP }; + typedef enum special_operations_t special_ops; // Required to identify for the power model enum operation_pipeline_t { @@ -200,7 +233,8 @@ class kernel_info_t { // m_num_cores_running=0; // m_param_mem=NULL; // } - kernel_info_t(dim3 gridDim, dim3 blockDim, class function_info *entry); + kernel_info_t(dim3 gridDim, dim3 blockDim, class function_info *entry, + unsigned long long streamID); kernel_info_t( dim3 gridDim, dim3 blockDim, class function_info *entry, std::map<std::string, const struct cudaArray *> nameToCudaArray, @@ -259,6 +293,8 @@ class kernel_info_t { m_next_tid.x < m_block_dim.x; } unsigned get_uid() const { return m_uid; } + unsigned long long get_streamID() const { return m_streamID; } + std::string get_name() const { return name(); } std::string name() const; std::list<class ptx_thread_info *> &active_threads() { @@ -291,7 +327,8 @@ class kernel_info_t { class function_info *m_kernel_entry; - unsigned m_uid; + unsigned m_uid; // Kernel ID + unsigned long long m_streamID; // These maps contain the snapshot of the texture mappings at kernel launch std::map<std::string, const struct cudaArray *> m_NameToCudaArray;
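// A minimal sketch (hypothetical names, not the simulator's classes) of the
// pattern the kernel_info_t changes above introduce: every kernel carries the
// ID of the CUDA stream it was launched on, with (unsigned long long)-1
// reserved as a "no stream" sentinel for bookkeeping-only kernels such as the
// occupancy probe built earlier in this patch.
#include <cstdio>

struct kernel_record {            // stand-in for kernel_info_t
  unsigned uid;                   // kernel ID
  unsigned long long stream_id;   // owning stream, or -1 sentinel
  bool has_stream() const { return stream_id != (unsigned long long)-1; }
};

int main() {
  kernel_record probe{1, (unsigned long long)-1};  // resource check only
  kernel_record launched{2, 7ULL};                 // hypothetical stream 7
  std::printf("kernel %u: %s\n", probe.uid,
              probe.has_stream() ? "stream-bound" : "sentinel (-1)");
  std::printf("kernel %u runs on stream %llu\n", launched.uid,
              launched.stream_id);
  return 0;
}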
@@ -373,6 +410,8 @@ class core_config { } unsigned mem_warp_parts; mutable unsigned gpgpu_shmem_size; + char *gpgpu_shmem_option; + std::vector<unsigned> shmem_opt_list; unsigned gpgpu_shmem_sizeDefault; unsigned gpgpu_shmem_sizePrefL1; unsigned gpgpu_shmem_sizePrefShared; @@ -864,11 +903,19 @@ class mem_fetch_interface { class mem_fetch_allocator { public: virtual mem_fetch *alloc(new_addr_type addr, mem_access_type type, - unsigned size, bool wr, - unsigned long long cycle) const = 0; + unsigned size, bool wr, unsigned long long cycle, + unsigned long long streamID) const = 0; virtual mem_fetch *alloc(const class warp_inst_t &inst, const mem_access_t &access, unsigned long long cycle) const = 0; + virtual mem_fetch *alloc(new_addr_type addr, mem_access_type type, + const active_mask_t &active_mask, + const mem_access_byte_mask_t &byte_mask, + const mem_access_sector_mask_t &sector_mask, + unsigned size, bool wr, unsigned long long cycle, + unsigned wid, unsigned sid, unsigned tpc, + mem_fetch *original_mf, + unsigned long long streamID) const = 0; }; // the maximum number of destination, source, or address uarch operands in a @@ -902,6 +949,7 @@ class inst_t { sp_op = OTHER_OP; op_pipe = UNKOWN_OP; mem_op = NOT_TEX; + const_cache_operand = 0; num_operands = 0; num_regs = 0; memset(out, 0, sizeof(unsigned)); @@ -920,7 +968,7 @@ class inst_t { } bool valid() const { return m_decoded; } virtual void print_insn(FILE *fp) const { - fprintf(fp, " [inst @ pc=0x%04x] ", pc); + fprintf(fp, " [inst @ pc=0x%04llx] ", pc); } bool is_load() const { return (op == LOAD_OP || op == TENSOR_CORE_LOAD_OP || @@ -930,6 +978,24 @@ class inst_t { return (op == STORE_OP || op == TENSOR_CORE_STORE_OP || memory_op == memory_store); } + + bool is_fp() const { return ((sp_op == FP__OP)); } // VIJAY + bool is_fpdiv() const { return ((sp_op == FP_DIV_OP)); } + bool is_fpmul() const { return ((sp_op == FP_MUL_OP)); } + bool is_dp() const { return ((sp_op == DP___OP)); } + bool is_dpdiv() const { return ((sp_op == DP_DIV_OP)); } + bool is_dpmul() const { return ((sp_op == DP_MUL_OP)); } + bool is_imul() const { return ((sp_op == INT_MUL_OP)); } + bool is_imul24() const { return ((sp_op == INT_MUL24_OP)); } + bool is_imul32() const { return ((sp_op == INT_MUL32_OP)); } + bool is_idiv() const { return ((sp_op == INT_DIV_OP)); } + bool is_sfu() const { + return ((sp_op == FP_SQRT_OP) || (sp_op == FP_LG_OP) || + (sp_op == FP_SIN_OP) || (sp_op == FP_EXP_OP) || + (sp_op == TENSOR__OP)); + } + bool is_alu() const { return (sp_op == INT__OP); } + unsigned get_num_operands() const { return num_operands; } unsigned get_num_regs() const { return num_regs; } void set_num_regs(unsigned num) { num_regs = num; } @@ -953,6 +1019,7 @@ class inst_t { operation_pipeline op_pipe; // code (uarch visible) identify the pipeline of // the operation (SP, SFU or MEM) mem_operation mem_op; // code (uarch visible) identify memory type + bool const_cache_operand; // has a load from constant memory as an operand _memory_op_t memory_op; // memory_op used by ptxplus unsigned num_operands; unsigned num_regs; // count vector operand as one register operand @@ -996,11 +1063,20 @@ class warp_inst_t : public inst_t { // constructors warp_inst_t() { m_uid = 0; + m_streamID = (unsigned long long)-1; m_empty = true; m_config = NULL; + + // Ni: + m_is_ldgsts = false; + m_is_ldgdepbar = false; + m_is_depbar = false; + + m_depbar_group_no = 0; } warp_inst_t(const core_config *config) { m_uid = 0; + m_streamID = (unsigned long long)-1; assert(config->warp_size <=
MAX_WARP_SIZE); m_config = config; m_empty = true; @@ -1011,6 +1087,13 @@ class warp_inst_t : public inst_t { m_is_printf = false; m_is_cdp = 0; should_do_atomic = true; + + // Ni: + m_is_ldgsts = false; + m_is_ldgdepbar = false; + m_is_depbar = false; + + m_depbar_group_no = 0; } virtual ~warp_inst_t() {} @@ -1021,7 +1104,8 @@ class warp_inst_t : public inst_t { void clear() { m_empty = true; } void issue(const active_mask_t &mask, unsigned warp_id, - unsigned long long cycle, int dynamic_warp_id, int sch_id); + unsigned long long cycle, int dynamic_warp_id, int sch_id, + unsigned long long streamID); const active_mask_t &get_active_mask() const { return m_warp_active_mask; } void completed(unsigned long long cycle) @@ -1099,7 +1183,7 @@ class warp_inst_t : public inst_t { // accessors virtual void print_insn(FILE *fp) const { - fprintf(fp, " [inst @ pc=0x%04x] ", pc); + fprintf(fp, " [inst @ pc=0x%04llx] ", pc); for (int i = (int)m_config->warp_size - 1; i >= 0; i--) fprintf(fp, "%c", ((m_warp_active_mask[i]) ? '1' : '0')); } @@ -1149,11 +1233,13 @@ class warp_inst_t : public inst_t { void print(FILE *fout) const; unsigned get_uid() const { return m_uid; } + unsigned long long get_streamID() const { return m_streamID; } unsigned get_schd_id() const { return m_scheduler_id; } active_mask_t get_warp_active_mask() const { return m_warp_active_mask; } protected: unsigned m_uid; + unsigned long long m_streamID; bool m_empty; bool m_cache_hit; unsigned long long issue_cycle; @@ -1193,6 +1279,13 @@ class warp_inst_t : public inst_t { // Jin: cdp support public: int m_is_cdp; + + // Ni: add boolean to indicate whether the instruction is ldgsts + bool m_is_ldgsts; + bool m_is_ldgdepbar; + bool m_is_depbar; + + unsigned int m_depbar_group_no; }; void move_warp(warp_inst_t *&dst, warp_inst_t *&src); @@ -1291,6 +1384,7 @@ class register_set { } m_name = name; } + const char *get_name() { return m_name; } bool has_free() { for (unsigned i = 0; i < regs.size(); i++) { if (regs[i]->empty()) { @@ -1315,7 +1409,35 @@ class register_set { } return false; } + bool has_ready(bool sub_core_model, unsigned reg_id) { + if (!sub_core_model) return has_ready(); + assert(reg_id < regs.size()); + return (not regs[reg_id]->empty()); + } + unsigned get_ready_reg_id() { + // for sub core model we need to figure which reg_id has the ready warp + // this function should only be called if has_ready() was true + assert(has_ready()); + warp_inst_t **ready; + ready = NULL; + unsigned reg_id = 0; + for (unsigned i = 0; i < regs.size(); i++) { + if (not regs[i]->empty()) { + if (ready and (*ready)->get_uid() < regs[i]->get_uid()) { + // ready is oldest + } else { + ready = ®s[i]; + reg_id = i; + } + } + } + return reg_id; + } + unsigned get_schd_id(unsigned reg_id) { + assert(not regs[reg_id]->empty()); + return regs[reg_id]->get_schd_id(); + } void move_in(warp_inst_t *&src) { warp_inst_t **free = get_free(); move_warp(*free, src); @@ -1323,10 +1445,29 @@ class register_set { // void copy_in( warp_inst_t* src ){ // src->copy_contents_to(*get_free()); //} + void move_in(bool sub_core_model, unsigned reg_id, warp_inst_t *&src) { + warp_inst_t **free; + if (!sub_core_model) { + free = get_free(); + } else { + assert(reg_id < regs.size()); + free = get_free(sub_core_model, reg_id); + } + move_warp(*free, src); + } + void move_out_to(warp_inst_t *&dest) { warp_inst_t **ready = get_ready(); move_warp(dest, *ready); } + void move_out_to(bool sub_core_model, unsigned reg_id, warp_inst_t *&dest) { + if (!sub_core_model) { + return 
move_out_to(dest); + } + warp_inst_t **ready = get_ready(sub_core_model, reg_id); + assert(ready != NULL); + move_warp(dest, *ready); + } warp_inst_t **get_ready() { warp_inst_t **ready; @@ -1342,6 +1483,14 @@ class register_set { } return ready; } + warp_inst_t **get_ready(bool sub_core_model, unsigned reg_id) { + if (!sub_core_model) return get_ready(); + warp_inst_t **ready; + ready = NULL; + assert(reg_id < regs.size()); + if (not regs[reg_id]->empty()) ready = &regs[reg_id]; + return ready; + } void print(FILE *fp) const { fprintf(fp, "%s : @%p\n", m_name, this); diff --git a/src/gpuwattch/Alpha21364.xml b/src/accelwattch/Alpha21364.xml similarity index 100% rename from src/gpuwattch/Alpha21364.xml rename to src/accelwattch/Alpha21364.xml diff --git a/src/accelwattch/CMakeLists.txt b/src/accelwattch/CMakeLists.txt new file mode 100644 index 000000000..cb7dd7178 --- /dev/null +++ b/src/accelwattch/CMakeLists.txt @@ -0,0 +1,46 @@ +set(GPGPUSIM_ACCELWATTCH_NTHREADS "4" CACHE STRING "Accelwattch MCPAT thread count") +add_library(accelwattch STATIC + cacti/Ucache.cc + XML_Parse.cc + cacti/arbiter.cc + cacti/area.cc + array.cc + cacti/bank.cc + cacti/basic_circuit.cc + basic_components.cc + cacti/cacti_interface.cc + cacti/component.cc + core.cc + cacti/crossbar.cc + cacti/decoder.cc + cacti/htree2.cc + interconnect.cc + cacti/io.cc + iocontrollers.cc + logic.cc + main.cc + cacti/mat.cc + memoryctrl.cc + noc.cc + cacti/nuca.cc + cacti/parameter.cc + processor.cc + cacti/router.cc + sharedcache.cc + cacti/subarray.cc + cacti/technology.cc + cacti/uca.cc + cacti/wire.cc + xmlParser.cc + gpgpu_sim_wrapper.cc) +target_include_directories(accelwattch PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) +target_include_directories(accelwattch PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/cacti) +# Compile options +target_compile_options(accelwattch PRIVATE "-Wno-unknown-pragmas") +if($<CONFIG:Debug>) + target_compile_definitions(NTHREADS=1) +else() + target_compile_options(accelwattch PRIVATE "-msse2;-mfpmath=sse") + target_compile_definitions(accelwattch PRIVATE -DNTHREADS=${GPGPUSIM_ACCELWATTCH_NTHREADS}) +endif() +target_link_options(accelwattch PRIVATE "-lm;-lpthread;-lz") \ No newline at end of file diff --git a/src/gpuwattch/Niagara1.xml b/src/accelwattch/Niagara1.xml similarity index 100% rename from src/gpuwattch/Niagara1.xml rename to src/accelwattch/Niagara1.xml diff --git a/src/gpuwattch/Niagara1_sharing.xml b/src/accelwattch/Niagara1_sharing.xml similarity index 100% rename from src/gpuwattch/Niagara1_sharing.xml rename to src/accelwattch/Niagara1_sharing.xml diff --git a/src/gpuwattch/Niagara1_sharing_DC.xml b/src/accelwattch/Niagara1_sharing_DC.xml similarity index 100% rename from src/gpuwattch/Niagara1_sharing_DC.xml rename to src/accelwattch/Niagara1_sharing_DC.xml diff --git a/src/gpuwattch/Niagara1_sharing_SBT.xml b/src/accelwattch/Niagara1_sharing_SBT.xml similarity index 100% rename from src/gpuwattch/Niagara1_sharing_SBT.xml rename to src/accelwattch/Niagara1_sharing_SBT.xml diff --git a/src/gpuwattch/Niagara1_sharing_ST.xml b/src/accelwattch/Niagara1_sharing_ST.xml similarity index 100% rename from src/gpuwattch/Niagara1_sharing_ST.xml rename to src/accelwattch/Niagara1_sharing_ST.xml diff --git a/src/gpuwattch/Niagara2.xml b/src/accelwattch/Niagara2.xml similarity index 100% rename from src/gpuwattch/Niagara2.xml rename to src/accelwattch/Niagara2.xml diff --git a/src/gpuwattch/Penryn.xml b/src/accelwattch/Penryn.xml similarity index 100% rename from src/gpuwattch/Penryn.xml rename to src/accelwattch/Penryn.xml
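// A simplified sketch (assumed semantics, not the simulator's classes) of the
// sub-core register_set discipline added above: when sub_core_model is set,
// each scheduler owns one fixed slot, so readiness is tested per reg_id
// rather than by scanning every slot as the shared-mode path does.
#include <cassert>
#include <vector>

struct mini_slot { bool empty = true; };

struct mini_register_set {  // stand-in for register_set
  std::vector<mini_slot> regs;
  bool has_ready(bool sub_core_model, unsigned reg_id) const {
    if (!sub_core_model) {  // shared mode: any non-empty slot counts
      for (const mini_slot &s : regs)
        if (!s.empty) return true;
      return false;
    }
    assert(reg_id < regs.size());
    return !regs[reg_id].empty;  // sub-core: only this scheduler's slot
  }
};

int main() {
  mini_register_set rs{std::vector<mini_slot>(4)};
  rs.regs[2].empty = false;        // scheduler 2 has a ready instruction
  assert(rs.has_ready(false, 0));  // shared mode sees it
  assert(!rs.has_ready(true, 0));  // scheduler 0's own slot is empty
  assert(rs.has_ready(true, 2));   // scheduler 2's slot is ready
  return 0;
}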
diff --git a/src/gpuwattch/README b/src/accelwattch/README similarity index 100% rename from src/gpuwattch/README rename to src/accelwattch/README diff --git a/src/gpuwattch/XML_Parse.cc b/src/accelwattch/XML_Parse.cc similarity index 92% rename from src/gpuwattch/XML_Parse.cc rename to src/accelwattch/XML_Parse.cc index 1b9a38ae1..801de6ff9 100644 --- a/src/gpuwattch/XML_Parse.cc +++ b/src/accelwattch/XML_Parse.cc @@ -30,10 +30,11 @@ ***************************************************************************/ /******************************************************************** * Modified by: - ** Jingwen Leng, Univeristy of Texas, Austin * Syed Gilani, - *University of Wisconsin–Madison * Tayler Hetherington, - *University of British Columbia * Ahmed ElTantawy, University of - *British Columbia * + * Jingwen Leng, University of Texas, Austin + * Syed Gilani, University of Wisconsin–Madison + * Tayler Hetherington, University of British Columbia + * Ahmed ElTantawy, University of British Columbia + * Vijay Kandiah, Northwestern University ********************************************************************/ #include "XML_Parse.h" @@ -44,12 +45,17 @@ using namespace std; const char* perf_count_label[] = { - "TOT_INST,", "FP_INT,", "IC_H,", "IC_M,", "DC_RH,", - "DC_RM,", "DC_WH,", "DC_WM,", "TC_H,", "TC_M,", - "CC_H,", "CC_M,", "SHRD_ACC,", "REG_RD,", "REG_WR,", - "NON_REG_OPs,", "SP_ACC,", "SFU_ACC,", "FPU_ACC,", "MEM_RD,", - "MEM_WR,", "MEM_PRE,", "L2_RH,", "L2_RM,", "L2_WH,", - "L2_WM,", "NOC_A,", "PIPE_A,", "IDLE_CORE_N,", "CONST_DYNAMICN"}; + "TOT_INST,", "FP_INT,", "IC_H,", "IC_M,", + "DC_RH,", "DC_RM,", "DC_WH,", "DC_WM,", + "TC_H,", "TC_M,", "CC_H,", "CC_M,", + "SHRD_ACC,", "REG_RD,", "REG_WR,", "NON_REG_OPs,", + "INT_ACC,", "FPU_ACC,", "DPU_ACC,", "INT_MUL24_ACC,", + "INT_MUL32_ACC,", "INT_MUL_ACC,", "INT_DIV_ACC,", "FP_MUL_ACC,", + "FP_DIV_ACC,", "FP_SQRT_ACC,", "FP_LG_ACC,", "FP_SIN_ACC,", + "FP_EXP_ACC,", "DP_MUL_ACC,", "DP_DIV_ACC,", "TENSOR_ACC,", + "TEX_ACC,", "MEM_RD,", "MEM_WR,", "MEM_PRE,", + "L2_RH,", "L2_RM,", "L2_WH,", "L2_WM,", + "NOC_A,", "PIPE_A,", "IDLE_CORE_N,", "constant_power"}; void ParseXML::parse(char* filepath) { unsigned int i, j, k, m, n; @@ -160,6 +166,196 @@ void ParseXML::parse(char* filepath) { atoi(xNode2.getChildNode("param", i).getAttribute("value")); continue; } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "modeled_chip_voltage_ref") == 0) { + sys.modeled_chip_voltage_ref = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_cat1_flane") == 0) { + sys.static_cat1_flane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_cat2_flane") == 0) { + sys.static_cat2_flane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_cat3_flane") == 0) { + sys.static_cat3_flane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_cat4_flane") == 0) { + sys.static_cat4_flane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_cat5_flane") == 0) { + sys.static_cat5_flane = + atof(xNode2.getChildNode("param", 
i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_cat6_flane") == 0) { + sys.static_cat6_flane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_shared_flane") == 0) { + sys.static_shared_flane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_l1_flane") == 0) { + sys.static_l1_flane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_l2_flane") == 0) { + sys.static_l2_flane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_light_flane") == 0) { + sys.static_light_flane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_intadd_flane") == 0) { + sys.static_intadd_flane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_intmul_flane") == 0) { + sys.static_intmul_flane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_geomean_flane") == 0) { + sys.static_geomean_flane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_cat1_addlane") == 0) { + sys.static_cat1_addlane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_cat2_addlane") == 0) { + sys.static_cat2_addlane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_cat3_addlane") == 0) { + sys.static_cat3_addlane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_cat4_addlane") == 0) { + sys.static_cat4_addlane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_cat5_addlane") == 0) { + sys.static_cat5_addlane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_cat6_addlane") == 0) { + sys.static_cat6_addlane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_shared_addlane") == 0) { + sys.static_shared_addlane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_l1_addlane") == 0) { + sys.static_l1_addlane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_l2_addlane") == 0) { + sys.static_l2_addlane = + atof(xNode2.getChildNode("param", 
i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_light_addlane") == 0) { + sys.static_light_addlane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_intadd_addlane") == 0) { + sys.static_intadd_addlane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_intmul_addlane") == 0) { + sys.static_intmul_addlane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_geomean_addlane") == 0) { + sys.static_geomean_addlane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), "target_chip_area") == 0) { sys.target_chip_area = @@ -420,20 +616,104 @@ void ParseXML::parse(char* filepath) { continue; } if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), - "SP_ACC") == 0) { - sys.scaling_coefficients[SP_ACC] = + "INT_ACC") == 0) { + sys.scaling_coefficients[INT_ACC] = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "FP_ACC") == 0) { + sys.scaling_coefficients[FP_ACC] = atof(xNode2.getChildNode("param", i).getAttribute("value")); continue; } if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), - "SFU_ACC") == 0) { - sys.scaling_coefficients[SFU_ACC] = + "DP_ACC") == 0) { + sys.scaling_coefficients[DP_ACC] = atof(xNode2.getChildNode("param", i).getAttribute("value")); continue; } if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), - "FPU_ACC") == 0) { - sys.scaling_coefficients[FPU_ACC] = + "INT_MUL24_ACC") == 0) { + sys.scaling_coefficients[INT_MUL24_ACC] = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "INT_MUL32_ACC") == 0) { + sys.scaling_coefficients[INT_MUL32_ACC] = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "INT_MUL_ACC") == 0) { + sys.scaling_coefficients[INT_MUL_ACC] = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "INT_DIV_ACC") == 0) { + sys.scaling_coefficients[INT_DIV_ACC] = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "FP_MUL_ACC") == 0) { + sys.scaling_coefficients[FP_MUL_ACC] = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "FP_DIV_ACC") == 0) { + sys.scaling_coefficients[FP_DIV_ACC] = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "FP_SQRT_ACC") == 0) { + sys.scaling_coefficients[FP_SQRT_ACC] = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "FP_LG_ACC") == 0) { + sys.scaling_coefficients[FP_LG_ACC] = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; 
+ } + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "FP_SIN_ACC") == 0) { + sys.scaling_coefficients[FP_SIN_ACC] = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "FP_EXP_ACC") == 0) { + sys.scaling_coefficients[FP_EXP_ACC] = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "DP_MUL_ACC") == 0) { + sys.scaling_coefficients[DP_MUL_ACC] = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "DP_DIV_ACC") == 0) { + sys.scaling_coefficients[DP_DIV_ACC] = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "TENSOR_ACC") == 0) { + sys.scaling_coefficients[TENSOR_ACC] = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "TEX_ACC") == 0) { + sys.scaling_coefficients[TEX_ACC] = atof(xNode2.getChildNode("param", i).getAttribute("value")); continue; } @@ -498,8 +778,8 @@ void ParseXML::parse(char* filepath) { continue; } if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), - "CONST_DYNAMICN") == 0) { - sys.scaling_coefficients[CONST_DYNAMICN] = + "constant_power") == 0) { + sys.scaling_coefficients[constant_power] = atof(xNode2.getChildNode("param", i).getAttribute("value")); continue; } @@ -4187,8 +4467,9 @@ void ParseXML::initialize() // Initialize all // strcpy(sys.homogeneous_cores,"default"); sys.core_tech_node = 1; sys.target_core_clockrate = 1; + sys.modeled_chip_voltage_ref = 1; sys.target_chip_area = 1; - sys.temperature = 1; + sys.temperature = 340; sys.number_cache_levels = 1; sys.homogeneous_cores = 1; sys.homogeneous_L1Directories = 1; @@ -4198,6 +4479,34 @@ void ParseXML::initialize() // Initialize all sys.homogeneous_NoCs = 1; sys.homogeneous_ccs = 1; + sys.static_cat1_flane = 0; + sys.static_cat2_flane = 0; + sys.static_cat3_flane = 0; + sys.static_cat4_flane = 0; + sys.static_cat5_flane = 0; + sys.static_cat6_flane = 0; + sys.static_shared_flane = 0; + sys.static_l1_flane = 0; + sys.static_l2_flane = 0; + sys.static_light_flane = 0; + sys.static_intadd_flane = 0; + sys.static_intmul_flane = 0; + sys.static_geomean_flane = 0; + + sys.static_cat1_addlane = 0; + sys.static_cat2_addlane = 0; + sys.static_cat3_addlane = 0; + sys.static_cat4_addlane = 0; + sys.static_cat5_addlane = 0; + sys.static_cat6_addlane = 0; + sys.static_shared_addlane = 0; + sys.static_l1_addlane = 0; + sys.static_l2_addlane = 0; + sys.static_light_addlane = 0; + sys.static_intadd_addlane = 0; + sys.static_intmul_addlane = 0; + sys.static_geomean_addlane = 0; + sys.Max_area_deviation = 1; sys.Max_power_deviation = 1; sys.device_type = 1; diff --git a/src/gpuwattch/XML_Parse.h b/src/accelwattch/XML_Parse.h similarity index 88% rename from src/gpuwattch/XML_Parse.h rename to src/accelwattch/XML_Parse.h index 30c4e4b13..176b82f6e 100644 --- a/src/gpuwattch/XML_Parse.h +++ b/src/accelwattch/XML_Parse.h @@ -30,10 +30,11 @@ ***************************************************************************/ /******************************************************************** * Modified by: - ** Jingwen Leng, Univeristy of Texas, Austin * Syed Gilani, - *University of Wisconsin–Madison * Tayler Hetherington, - 
*University of British Columbia * Ahmed ElTantawy, University of - *British Columbia * + * Jingwen Leng, University of Texas, Austin + * Syed Gilani, University of Wisconsin–Madison + * Tayler Hetherington, University of British Columbia + * Ahmed ElTantawy, University of British Columbia + * Vijay Kandiah, Northwestern University ********************************************************************/ #ifndef XML_PARSE_H_ @@ -85,9 +86,23 @@ enum perf_count_t { REG_RD, REG_WR, NON_REG_OPs, - SP_ACC, - SFU_ACC, - FPU_ACC, + INT_ACC, // SPU + FP_ACC, // FPU + DP_ACC, // FPU + INT_MUL24_ACC, // SFU + INT_MUL32_ACC, // SFU + INT_MUL_ACC, // SFU + INT_DIV_ACC, // SFU + FP_MUL_ACC, // SFU + FP_DIV_ACC, // SFU + FP_SQRT_ACC, // SFU + FP_LG_ACC, // SFU + FP_SIN_ACC, // SFU + FP_EXP_ACC, // SFU + DP_MUL_ACC, // SFU + DP_DIV_ACC, // SFU + TENSOR_ACC, // SFU + TEX_ACC, // SFU MEM_RD, MEM_WR, MEM_PRE, @@ -98,7 +113,7 @@ enum perf_count_t { NOC_A, PIPE_A, IDLE_CORE_N, - CONST_DYNAMICN, + constant_power, NUM_PERFORMANCE_COUNTERS }; @@ -635,6 +650,33 @@ typedef struct { int homogeneous_L2Directories; double core_tech_node; int target_core_clockrate; + double modeled_chip_voltage_ref; + double static_cat1_flane; + double static_cat2_flane; + double static_cat3_flane; + double static_cat4_flane; + double static_cat5_flane; + double static_cat6_flane; + double static_shared_flane; + double static_l1_flane; + double static_l2_flane; + double static_light_flane; + double static_intadd_flane; + double static_intmul_flane; + double static_geomean_flane; + double static_cat1_addlane; + double static_cat2_addlane; + double static_cat3_addlane; + double static_cat4_addlane; + double static_cat5_addlane; + double static_cat6_addlane; + double static_shared_addlane; + double static_l1_addlane; + double static_l2_addlane; + double static_light_addlane; + double static_intadd_addlane; + double static_intmul_addlane; + double static_geomean_addlane; int target_chip_area; int temperature; int number_cache_levels; diff --git a/src/gpuwattch/Xeon.xml b/src/accelwattch/Xeon.xml similarity index 100% rename from src/gpuwattch/Xeon.xml rename to src/accelwattch/Xeon.xml diff --git a/src/gpuwattch/arch_const.h b/src/accelwattch/arch_const.h similarity index 100% rename from src/gpuwattch/arch_const.h rename to src/accelwattch/arch_const.h diff --git a/src/gpuwattch/array.cc b/src/accelwattch/array.cc similarity index 100% rename from src/gpuwattch/array.cc rename to src/accelwattch/array.cc diff --git a/src/gpuwattch/array.h b/src/accelwattch/array.h similarity index 100% rename from src/gpuwattch/array.h rename to src/accelwattch/array.h diff --git a/src/gpuwattch/basic_components.cc b/src/accelwattch/basic_components.cc similarity index 100% rename from src/gpuwattch/basic_components.cc rename to src/accelwattch/basic_components.cc diff --git a/src/gpuwattch/basic_components.h b/src/accelwattch/basic_components.h similarity index 100% rename from src/gpuwattch/basic_components.h rename to src/accelwattch/basic_components.h diff --git a/src/gpuwattch/cacti/README b/src/accelwattch/cacti/README similarity index 100% rename from src/gpuwattch/cacti/README rename to src/accelwattch/cacti/README diff --git a/src/gpuwattch/cacti/Ucache.cc b/src/accelwattch/cacti/Ucache.cc similarity index 99% rename from src/gpuwattch/cacti/Ucache.cc rename to src/accelwattch/cacti/Ucache.cc index 8f733f73b..e92e67b91 100644 --- a/src/gpuwattch/cacti/Ucache.cc +++ b/src/accelwattch/cacti/Ucache.cc @@ -223,7 +223,7 @@ void * calc_time_mt_wrapper(void 
* void_obj) delete tag_arr.back(); data_arr.pop_back(); tag_arr.pop_back(); - + pthread_exit(NULL); } @@ -246,7 +246,7 @@ bool calculate_time( { DynamicParameter dyn_p(is_tag, pure_ram, pure_cam, Nspd, Ndwl, Ndbl, Ndcm, Ndsam_lev_1, Ndsam_lev_2, is_main_mem); - if (dyn_p.is_valid == false) + if (dyn_p.is_valid != true) { return false; } diff --git a/src/gpuwattch/cacti/Ucache.h b/src/accelwattch/cacti/Ucache.h similarity index 100% rename from src/gpuwattch/cacti/Ucache.h rename to src/accelwattch/cacti/Ucache.h diff --git a/src/gpuwattch/cacti/arbiter.cc b/src/accelwattch/cacti/arbiter.cc similarity index 100% rename from src/gpuwattch/cacti/arbiter.cc rename to src/accelwattch/cacti/arbiter.cc diff --git a/src/gpuwattch/cacti/arbiter.h b/src/accelwattch/cacti/arbiter.h similarity index 100% rename from src/gpuwattch/cacti/arbiter.h rename to src/accelwattch/cacti/arbiter.h diff --git a/src/gpuwattch/cacti/area.cc b/src/accelwattch/cacti/area.cc similarity index 100% rename from src/gpuwattch/cacti/area.cc rename to src/accelwattch/cacti/area.cc diff --git a/src/gpuwattch/cacti/area.h b/src/accelwattch/cacti/area.h similarity index 100% rename from src/gpuwattch/cacti/area.h rename to src/accelwattch/cacti/area.h diff --git a/src/gpuwattch/cacti/bank.cc b/src/accelwattch/cacti/bank.cc similarity index 100% rename from src/gpuwattch/cacti/bank.cc rename to src/accelwattch/cacti/bank.cc diff --git a/src/gpuwattch/cacti/bank.h b/src/accelwattch/cacti/bank.h similarity index 100% rename from src/gpuwattch/cacti/bank.h rename to src/accelwattch/cacti/bank.h diff --git a/src/gpuwattch/cacti/basic_circuit.cc b/src/accelwattch/cacti/basic_circuit.cc similarity index 100% rename from src/gpuwattch/cacti/basic_circuit.cc rename to src/accelwattch/cacti/basic_circuit.cc diff --git a/src/gpuwattch/cacti/basic_circuit.h b/src/accelwattch/cacti/basic_circuit.h similarity index 100% rename from src/gpuwattch/cacti/basic_circuit.h rename to src/accelwattch/cacti/basic_circuit.h diff --git a/src/gpuwattch/cacti/batch_tests b/src/accelwattch/cacti/batch_tests similarity index 100% rename from src/gpuwattch/cacti/batch_tests rename to src/accelwattch/cacti/batch_tests diff --git a/src/gpuwattch/cacti/cache.cfg b/src/accelwattch/cacti/cache.cfg similarity index 100% rename from src/gpuwattch/cacti/cache.cfg rename to src/accelwattch/cacti/cache.cfg diff --git a/src/gpuwattch/cacti/cacti.i b/src/accelwattch/cacti/cacti.i similarity index 100% rename from src/gpuwattch/cacti/cacti.i rename to src/accelwattch/cacti/cacti.i diff --git a/src/gpuwattch/cacti/cacti.mk b/src/accelwattch/cacti/cacti.mk similarity index 96% rename from src/gpuwattch/cacti/cacti.mk rename to src/accelwattch/cacti/cacti.mk index 7f3c57338..41f9218f4 100644 --- a/src/gpuwattch/cacti/cacti.mk +++ b/src/accelwattch/cacti/cacti.mk @@ -1,5 +1,5 @@ -OUTPUT_DIR=$(SIM_OBJ_FILES_DIR)/gpuwattch/cacti +OUTPUT_DIR=$(SIM_OBJ_FILES_DIR)/accelwattch/cacti TARGET = cacti SHELL = /bin/sh .PHONY: all depend clean diff --git a/src/gpuwattch/cacti/cacti_interface.cc b/src/accelwattch/cacti/cacti_interface.cc similarity index 100% rename from src/gpuwattch/cacti/cacti_interface.cc rename to src/accelwattch/cacti/cacti_interface.cc diff --git a/src/gpuwattch/cacti/cacti_interface.h b/src/accelwattch/cacti/cacti_interface.h similarity index 100% rename from src/gpuwattch/cacti/cacti_interface.h rename to src/accelwattch/cacti/cacti_interface.h diff --git a/src/gpuwattch/cacti/component.cc b/src/accelwattch/cacti/component.cc similarity index 100% rename from 
src/gpuwattch/cacti/component.cc rename to src/accelwattch/cacti/component.cc diff --git a/src/gpuwattch/cacti/component.h b/src/accelwattch/cacti/component.h similarity index 100% rename from src/gpuwattch/cacti/component.h rename to src/accelwattch/cacti/component.h diff --git a/src/gpuwattch/cacti/const.h b/src/accelwattch/cacti/const.h similarity index 100% rename from src/gpuwattch/cacti/const.h rename to src/accelwattch/cacti/const.h diff --git a/src/gpuwattch/cacti/contention.dat b/src/accelwattch/cacti/contention.dat similarity index 100% rename from src/gpuwattch/cacti/contention.dat rename to src/accelwattch/cacti/contention.dat diff --git a/src/gpuwattch/cacti/crossbar.cc b/src/accelwattch/cacti/crossbar.cc similarity index 100% rename from src/gpuwattch/cacti/crossbar.cc rename to src/accelwattch/cacti/crossbar.cc diff --git a/src/gpuwattch/cacti/crossbar.h b/src/accelwattch/cacti/crossbar.h similarity index 100% rename from src/gpuwattch/cacti/crossbar.h rename to src/accelwattch/cacti/crossbar.h diff --git a/src/gpuwattch/cacti/decoder.cc b/src/accelwattch/cacti/decoder.cc similarity index 100% rename from src/gpuwattch/cacti/decoder.cc rename to src/accelwattch/cacti/decoder.cc diff --git a/src/gpuwattch/cacti/decoder.h b/src/accelwattch/cacti/decoder.h similarity index 100% rename from src/gpuwattch/cacti/decoder.h rename to src/accelwattch/cacti/decoder.h diff --git a/src/gpuwattch/cacti/highradix.cc b/src/accelwattch/cacti/highradix.cc similarity index 100% rename from src/gpuwattch/cacti/highradix.cc rename to src/accelwattch/cacti/highradix.cc diff --git a/src/gpuwattch/cacti/highradix.h b/src/accelwattch/cacti/highradix.h similarity index 100% rename from src/gpuwattch/cacti/highradix.h rename to src/accelwattch/cacti/highradix.h diff --git a/src/gpuwattch/cacti/htree2.cc b/src/accelwattch/cacti/htree2.cc similarity index 100% rename from src/gpuwattch/cacti/htree2.cc rename to src/accelwattch/cacti/htree2.cc diff --git a/src/gpuwattch/cacti/htree2.h b/src/accelwattch/cacti/htree2.h similarity index 100% rename from src/gpuwattch/cacti/htree2.h rename to src/accelwattch/cacti/htree2.h diff --git a/src/gpuwattch/cacti/io.cc b/src/accelwattch/cacti/io.cc similarity index 100% rename from src/gpuwattch/cacti/io.cc rename to src/accelwattch/cacti/io.cc diff --git a/src/gpuwattch/cacti/io.h b/src/accelwattch/cacti/io.h similarity index 100% rename from src/gpuwattch/cacti/io.h rename to src/accelwattch/cacti/io.h diff --git a/src/gpuwattch/cacti/main.cc b/src/accelwattch/cacti/main.cc similarity index 100% rename from src/gpuwattch/cacti/main.cc rename to src/accelwattch/cacti/main.cc diff --git a/src/gpuwattch/cacti/makefile b/src/accelwattch/cacti/makefile similarity index 100% rename from src/gpuwattch/cacti/makefile rename to src/accelwattch/cacti/makefile diff --git a/src/gpuwattch/cacti/mat.cc b/src/accelwattch/cacti/mat.cc similarity index 100% rename from src/gpuwattch/cacti/mat.cc rename to src/accelwattch/cacti/mat.cc diff --git a/src/gpuwattch/cacti/mat.h b/src/accelwattch/cacti/mat.h similarity index 100% rename from src/gpuwattch/cacti/mat.h rename to src/accelwattch/cacti/mat.h diff --git a/src/gpuwattch/cacti/nuca.cc b/src/accelwattch/cacti/nuca.cc similarity index 100% rename from src/gpuwattch/cacti/nuca.cc rename to src/accelwattch/cacti/nuca.cc diff --git a/src/gpuwattch/cacti/nuca.h b/src/accelwattch/cacti/nuca.h similarity index 100% rename from src/gpuwattch/cacti/nuca.h rename to src/accelwattch/cacti/nuca.h diff --git 
a/src/gpuwattch/cacti/out_batch_test_result.csv b/src/accelwattch/cacti/out_batch_test_result.csv similarity index 100% rename from src/gpuwattch/cacti/out_batch_test_result.csv rename to src/accelwattch/cacti/out_batch_test_result.csv diff --git a/src/gpuwattch/cacti/parameter.cc b/src/accelwattch/cacti/parameter.cc similarity index 100% rename from src/gpuwattch/cacti/parameter.cc rename to src/accelwattch/cacti/parameter.cc diff --git a/src/gpuwattch/cacti/parameter.h b/src/accelwattch/cacti/parameter.h similarity index 100% rename from src/gpuwattch/cacti/parameter.h rename to src/accelwattch/cacti/parameter.h diff --git a/src/gpuwattch/cacti/router.cc b/src/accelwattch/cacti/router.cc similarity index 100% rename from src/gpuwattch/cacti/router.cc rename to src/accelwattch/cacti/router.cc diff --git a/src/gpuwattch/cacti/router.h b/src/accelwattch/cacti/router.h similarity index 100% rename from src/gpuwattch/cacti/router.h rename to src/accelwattch/cacti/router.h diff --git a/src/gpuwattch/cacti/subarray.cc b/src/accelwattch/cacti/subarray.cc similarity index 100% rename from src/gpuwattch/cacti/subarray.cc rename to src/accelwattch/cacti/subarray.cc diff --git a/src/gpuwattch/cacti/subarray.h b/src/accelwattch/cacti/subarray.h similarity index 100% rename from src/gpuwattch/cacti/subarray.h rename to src/accelwattch/cacti/subarray.h diff --git a/src/gpuwattch/cacti/technology.cc b/src/accelwattch/cacti/technology.cc similarity index 100% rename from src/gpuwattch/cacti/technology.cc rename to src/accelwattch/cacti/technology.cc diff --git a/src/gpuwattch/cacti/uca.cc b/src/accelwattch/cacti/uca.cc similarity index 100% rename from src/gpuwattch/cacti/uca.cc rename to src/accelwattch/cacti/uca.cc diff --git a/src/gpuwattch/cacti/uca.h b/src/accelwattch/cacti/uca.h similarity index 100% rename from src/gpuwattch/cacti/uca.h rename to src/accelwattch/cacti/uca.h diff --git a/src/gpuwattch/cacti/wire.cc b/src/accelwattch/cacti/wire.cc similarity index 100% rename from src/gpuwattch/cacti/wire.cc rename to src/accelwattch/cacti/wire.cc diff --git a/src/gpuwattch/cacti/wire.h b/src/accelwattch/cacti/wire.h similarity index 100% rename from src/gpuwattch/cacti/wire.h rename to src/accelwattch/cacti/wire.h diff --git a/src/gpuwattch/core.cc b/src/accelwattch/core.cc similarity index 100% rename from src/gpuwattch/core.cc rename to src/accelwattch/core.cc diff --git a/src/gpuwattch/core.h b/src/accelwattch/core.h similarity index 100% rename from src/gpuwattch/core.h rename to src/accelwattch/core.h diff --git a/src/gpuwattch/fermi.xml b/src/accelwattch/fermi.xml similarity index 100% rename from src/gpuwattch/fermi.xml rename to src/accelwattch/fermi.xml diff --git a/src/gpuwattch/globalvar.h b/src/accelwattch/globalvar.h similarity index 100% rename from src/gpuwattch/globalvar.h rename to src/accelwattch/globalvar.h diff --git a/src/gpuwattch/gpgpu.xml b/src/accelwattch/gpgpu.xml similarity index 100% rename from src/gpuwattch/gpgpu.xml rename to src/accelwattch/gpgpu.xml diff --git a/src/gpuwattch/gpgpu_sim.verify b/src/accelwattch/gpgpu_sim.verify similarity index 100% rename from src/gpuwattch/gpgpu_sim.verify rename to src/accelwattch/gpgpu_sim.verify diff --git a/src/gpuwattch/gpgpu_sim_wrapper.cc b/src/accelwattch/gpgpu_sim_wrapper.cc similarity index 59% rename from src/gpuwattch/gpgpu_sim_wrapper.cc rename to src/accelwattch/gpgpu_sim_wrapper.cc index f2989f630..4883c7c54 100644 --- a/src/gpuwattch/gpgpu_sim_wrapper.cc +++ b/src/accelwattch/gpgpu_sim_wrapper.cc @@ -1,18 +1,20 @@ 
-// Copyright (c) 2009-2011, Tor M. Aamodt, Tayler Hetherington, Ahmed ElTantawy, -// The University of British Columbia -// All rights reserved. +// Copyright (c) 2009-2021, Tor M. Aamodt, Tayler Hetherington, Ahmed ElTantawy, +// Vijay Kandiah, Nikos Hardavellas The University of British Columbia, +// Northwestern University All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, +// this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -32,9 +34,13 @@ #define SFU_BASE_POWER 0 static const char* pwr_cmp_label[] = { - "IBP,", "ICP,", "DCP,", "TCP,", "CCP,", "SHRDP,", - "RFP,", "SPP,", "SFUP,", "FPUP,", "SCHEDP,", "L2CP,", - "MCP,", "NOCP,", "DRAMP,", "PIPEP,", "IDLE_COREP,", "CONST_DYNAMICP"}; + "IBP,", "ICP,", "DCP,", "TCP,", "CCP,", + "SHRDP,", "RFP,", "INTP,", "FPUP,", "DPUP,", + "INT_MUL24P,", "INT_MUL32P,", "INT_MULP,", "INT_DIVP,", "FP_MULP,", + "FP_DIVP,", "FP_SQRTP,", "FP_LGP,", "FP_SINP,", "FP_EXP,", + "DP_MULP,", "DP_DIVP,", "TENSORP,", "TEXP,", "SCHEDP,", + "L2CP,", "MCP,", "NOCP,", "DRAMP,", "PIPEP,", + "IDLE_COREP,", "CONSTP", "STATICP"}; enum pwr_cmp_t { IBP = 0, @@ -44,9 +50,23 @@ enum pwr_cmp_t { CCP, SHRDP, RFP, - SPP, - SFUP, + INTP, FPUP, + DPUP, + INT_MUL24P, + INT_MUL32P, + INT_MULP, + INT_DIVP, + FP_MULP, + FP_DIVP, + FP_SQRTP, + FP_LGP, + FP_SINP, + FP_EXP, + DP_MULP, + DP_DIVP, + TENSORP, + TEXP, SCHEDP, L2CP, MCP, @@ -54,17 +74,19 @@ enum pwr_cmp_t { DRAMP, PIPEP, IDLE_COREP, - CONST_DYNAMICP, + CONSTP, + STATICP, NUM_COMPONENTS_MODELLED }; gpgpu_sim_wrapper::gpgpu_sim_wrapper(bool power_simulation_enabled, - char* xmlfile) { + char* xmlfile, int power_simulation_mode, + bool dvfs_enabled) { kernel_sample_count = 0; total_sample_count = 0; kernel_tot_power = 0; - + avg_threads_per_warp_tot = 0; num_pwr_cmps = NUM_COMPONENTS_MODELLED; num_perf_counters = NUM_PERFORMANCE_COUNTERS; @@ -91,6 +113,8 @@ gpgpu_sim_wrapper::gpgpu_sim_wrapper(bool power_simulation_enabled, g_steady_state_tracking_filename = NULL; xml_filename = xmlfile; g_power_simulation_enabled = power_simulation_enabled; + g_power_simulation_mode = power_simulation_mode; + g_dvfs_enabled = dvfs_enabled; g_power_trace_enabled = false; 
g_steady_power_levels_enabled = false; g_power_trace_zlevel = 0; @@ -121,13 +145,19 @@ bool gpgpu_sim_wrapper::sanity_check(double a, double b) { return false; } +void gpgpu_sim_wrapper::init_mcpat_hw_mode(unsigned gpu_sim_cycle) { + p->sys.total_cycles = + gpu_sim_cycle; // total simulated cycles for current kernel +} + void gpgpu_sim_wrapper::init_mcpat( char* xmlfile, char* powerfilename, char* power_trace_filename, char* metric_trace_filename, char* steady_state_filename, bool power_sim_enabled, bool trace_enabled, bool steady_state_enabled, bool power_per_cycle_dump, double steady_power_deviation, - double steady_min_period, int zlevel, double init_val, - int stat_sample_freq) { + double steady_min_period, int zlevel, double init_val, int stat_sample_freq, + int power_sim_mode, bool dvfs_enabled, unsigned clock_freq, + unsigned num_shaders) { // Write File Headers for (-metrics trace, -power trace) reset_counters(); @@ -151,6 +181,8 @@ void gpgpu_sim_wrapper::init_mcpat( g_steady_state_tracking_filename = steady_state_filename; xml_filename = xmlfile; g_power_simulation_enabled = power_sim_enabled; + g_power_simulation_mode = power_sim_mode; + g_dvfs_enabled = dvfs_enabled; g_power_trace_enabled = trace_enabled; g_steady_power_levels_enabled = steady_state_enabled; g_power_trace_zlevel = zlevel; @@ -162,6 +194,9 @@ void gpgpu_sim_wrapper::init_mcpat( // p->sys.total_cycles=gpu_stat_sample_freq*4; p->sys.total_cycles = gpu_stat_sample_freq; + p->sys.target_core_clockrate = clock_freq; + p->sys.number_of_cores = num_shaders; + p->sys.core[0].clock_rate = clock_freq; power_trace_file = NULL; metric_trace_file = NULL; steady_state_tacking_file = NULL; @@ -232,7 +267,7 @@ void gpgpu_sim_wrapper::reset_counters() { kernel_sample_count = 0; kernel_tot_power = 0; kernel_power = init; - + avg_threads_per_warp_tot = 0; return; } @@ -352,9 +387,12 @@ void gpgpu_sim_wrapper::set_l2cache_power(double read_hits, double read_misses, sample_perf_counters[L2_WM] = write_misses; } +void gpgpu_sim_wrapper::set_num_cores(double num_core) { num_cores = num_core; } + void gpgpu_sim_wrapper::set_idle_core_power(double num_idle_core) { p->sys.num_idle_cores = num_idle_core; sample_perf_counters[IDLE_CORE_N] = num_idle_core; + num_idle_cores = num_idle_core; } void gpgpu_sim_wrapper::set_duty_cycle_power(double duty_cycle) { @@ -375,21 +413,95 @@ void gpgpu_sim_wrapper::set_mem_ctrl_power(double reads, double writes, sample_perf_counters[MEM_PRE] = dram_precharge; } +void gpgpu_sim_wrapper::set_model_voltage(double model_voltage) { + modeled_chip_voltage = model_voltage; +} + void gpgpu_sim_wrapper::set_exec_unit_power(double fpu_accesses, double ialu_accesses, double sfu_accesses) { - p->sys.core[0].fpu_accesses = - fpu_accesses * p->sys.scaling_coefficients[FPU_ACC]; + p->sys.core[0].fpu_accesses = fpu_accesses; + tot_fpu_accesses = fpu_accesses; // Integer ALU (not present in Tesla) - p->sys.core[0].ialu_accesses = - ialu_accesses * p->sys.scaling_coefficients[SP_ACC]; + p->sys.core[0].ialu_accesses = ialu_accesses; + // Sfu accesses - p->sys.core[0].mul_accesses = - sfu_accesses * p->sys.scaling_coefficients[SFU_ACC]; + p->sys.core[0].mul_accesses = sfu_accesses; + tot_sfu_accesses = sfu_accesses; +} + +PowerscalingCoefficients* gpgpu_sim_wrapper::get_scaling_coeffs() { + PowerscalingCoefficients* scalingCoeffs = new PowerscalingCoefficients(); + + scalingCoeffs->int_coeff = p->sys.scaling_coefficients[INT_ACC]; + scalingCoeffs->int_mul_coeff = p->sys.scaling_coefficients[INT_MUL_ACC]; + 
scalingCoeffs->int_mul24_coeff = p->sys.scaling_coefficients[INT_MUL24_ACC]; + scalingCoeffs->int_mul32_coeff = p->sys.scaling_coefficients[INT_MUL32_ACC]; + scalingCoeffs->int_div_coeff = p->sys.scaling_coefficients[INT_DIV_ACC]; + scalingCoeffs->fp_coeff = p->sys.scaling_coefficients[FP_ACC]; + scalingCoeffs->dp_coeff = p->sys.scaling_coefficients[DP_ACC]; + scalingCoeffs->fp_mul_coeff = p->sys.scaling_coefficients[FP_MUL_ACC]; + scalingCoeffs->fp_div_coeff = p->sys.scaling_coefficients[FP_DIV_ACC]; + scalingCoeffs->dp_mul_coeff = p->sys.scaling_coefficients[DP_MUL_ACC]; + scalingCoeffs->dp_div_coeff = p->sys.scaling_coefficients[DP_DIV_ACC]; + scalingCoeffs->sqrt_coeff = p->sys.scaling_coefficients[FP_SQRT_ACC]; + scalingCoeffs->log_coeff = p->sys.scaling_coefficients[FP_LG_ACC]; + scalingCoeffs->sin_coeff = p->sys.scaling_coefficients[FP_SIN_ACC]; + scalingCoeffs->exp_coeff = p->sys.scaling_coefficients[FP_EXP_ACC]; + scalingCoeffs->tensor_coeff = p->sys.scaling_coefficients[TENSOR_ACC]; + scalingCoeffs->tex_coeff = p->sys.scaling_coefficients[TEX_ACC]; + return scalingCoeffs; +} + +void gpgpu_sim_wrapper::set_int_accesses(double ialu_accesses, + double imul24_accesses, + double imul32_accesses, + double imul_accesses, + double idiv_accesses) { + sample_perf_counters[INT_ACC] = ialu_accesses; + sample_perf_counters[INT_MUL24_ACC] = imul24_accesses; + sample_perf_counters[INT_MUL32_ACC] = imul32_accesses; + sample_perf_counters[INT_MUL_ACC] = imul_accesses; + sample_perf_counters[INT_DIV_ACC] = idiv_accesses; +} + +void gpgpu_sim_wrapper::set_dp_accesses(double dpu_accesses, + double dpmul_accesses, + double dpdiv_accesses) { + sample_perf_counters[DP_ACC] = dpu_accesses; + sample_perf_counters[DP_MUL_ACC] = dpmul_accesses; + sample_perf_counters[DP_DIV_ACC] = dpdiv_accesses; +} + +void gpgpu_sim_wrapper::set_fp_accesses(double fpu_accesses, + double fpmul_accesses, + double fpdiv_accesses) { + sample_perf_counters[FP_ACC] = fpu_accesses; + sample_perf_counters[FP_MUL_ACC] = fpmul_accesses; + sample_perf_counters[FP_DIV_ACC] = fpdiv_accesses; +} + +void gpgpu_sim_wrapper::set_trans_accesses(double sqrt_accesses, + double log_accesses, + double sin_accesses, + double exp_accesses) { + sample_perf_counters[FP_SQRT_ACC] = sqrt_accesses; + sample_perf_counters[FP_LG_ACC] = log_accesses; + sample_perf_counters[FP_SIN_ACC] = sin_accesses; + sample_perf_counters[FP_EXP_ACC] = exp_accesses; +} + +void gpgpu_sim_wrapper::set_tensor_accesses(double tensor_accesses) { + sample_perf_counters[TENSOR_ACC] = tensor_accesses; +} + +void gpgpu_sim_wrapper::set_tex_accesses(double tex_accesses) { + sample_perf_counters[TEX_ACC] = tex_accesses; +} - sample_perf_counters[SP_ACC] = ialu_accesses; - sample_perf_counters[SFU_ACC] = sfu_accesses; - sample_perf_counters[FPU_ACC] = fpu_accesses; +void gpgpu_sim_wrapper::set_avg_active_threads(float active_threads) { + avg_threads_per_warp = (unsigned)ceil(active_threads); + avg_threads_per_warp_tot += active_threads; } void gpgpu_sim_wrapper::set_active_lanes_power(double sp_avg_active_lane, @@ -398,12 +510,10 @@ void gpgpu_sim_wrapper::set_active_lanes_power(double sp_avg_active_lane, p->sys.core[0].sfu_average_active_lanes = sfu_avg_active_lane; } -void gpgpu_sim_wrapper::set_NoC_power(double noc_tot_reads, - double noc_tot_writes) { +void gpgpu_sim_wrapper::set_NoC_power(double noc_tot_acc) { p->sys.NoC[0].total_accesses = - noc_tot_reads * p->sys.scaling_coefficients[NOC_A] + - noc_tot_writes * p->sys.scaling_coefficients[NOC_A]; - 
sample_perf_counters[NOC_A] = noc_tot_reads + noc_tot_writes; + noc_tot_acc * p->sys.scaling_coefficients[NOC_A]; + sample_perf_counters[NOC_A] = noc_tot_acc; } void gpgpu_sim_wrapper::power_metrics_calculations() { @@ -411,8 +521,12 @@ void gpgpu_sim_wrapper::power_metrics_calculations() { kernel_sample_count++; // Current sample power - double sample_power = - proc->rt_power.readOp.dynamic + sample_cmp_pwr[CONST_DYNAMICP]; + double sample_power = proc->rt_power.readOp.dynamic + sample_cmp_pwr[CONSTP] + + sample_cmp_pwr[STATICP]; + // double sample_power; + // for(unsigned i=0; isys.scaling_coefficients[MEM_PRE]; - initpower_coeff[SP_ACC] = + double fp_coeff = proc->cores[0]->get_coefficient_fpu_accesses(); + double sfu_coeff = proc->cores[0]->get_coefficient_sfu_accesses(); + + initpower_coeff[INT_ACC] = proc->cores[0]->get_coefficient_ialu_accesses() * (proc->cores[0]->exu->rf_fu_clockRate / proc->cores[0]->exu->clockRate); - ; - initpower_coeff[SFU_ACC] = proc->cores[0]->get_coefficient_sfu_accesses(); - initpower_coeff[FPU_ACC] = proc->cores[0]->get_coefficient_fpu_accesses(); - effpower_coeff[SP_ACC] = - initpower_coeff[SP_ACC] * p->sys.scaling_coefficients[SP_ACC]; - effpower_coeff[SFU_ACC] = - initpower_coeff[SFU_ACC] * p->sys.scaling_coefficients[SFU_ACC]; - effpower_coeff[FPU_ACC] = - initpower_coeff[FPU_ACC] * p->sys.scaling_coefficients[FPU_ACC]; + if (tot_fpu_accesses != 0) { + initpower_coeff[FP_ACC] = + fp_coeff * sample_perf_counters[FP_ACC] / tot_fpu_accesses; + initpower_coeff[DP_ACC] = + fp_coeff * sample_perf_counters[DP_ACC] / tot_fpu_accesses; + } else { + initpower_coeff[FP_ACC] = 0; + initpower_coeff[DP_ACC] = 0; + } + + if (tot_sfu_accesses != 0) { + initpower_coeff[INT_MUL24_ACC] = + sfu_coeff * sample_perf_counters[INT_MUL24_ACC] / tot_sfu_accesses; + initpower_coeff[INT_MUL32_ACC] = + sfu_coeff * sample_perf_counters[INT_MUL32_ACC] / tot_sfu_accesses; + initpower_coeff[INT_MUL_ACC] = + sfu_coeff * sample_perf_counters[INT_MUL_ACC] / tot_sfu_accesses; + initpower_coeff[INT_DIV_ACC] = + sfu_coeff * sample_perf_counters[INT_DIV_ACC] / tot_sfu_accesses; + initpower_coeff[DP_MUL_ACC] = + sfu_coeff * sample_perf_counters[DP_MUL_ACC] / tot_sfu_accesses; + initpower_coeff[DP_DIV_ACC] = + sfu_coeff * sample_perf_counters[DP_DIV_ACC] / tot_sfu_accesses; + initpower_coeff[FP_MUL_ACC] = + sfu_coeff * sample_perf_counters[FP_MUL_ACC] / tot_sfu_accesses; + initpower_coeff[FP_DIV_ACC] = + sfu_coeff * sample_perf_counters[FP_DIV_ACC] / tot_sfu_accesses; + initpower_coeff[FP_SQRT_ACC] = + sfu_coeff * sample_perf_counters[FP_SQRT_ACC] / tot_sfu_accesses; + initpower_coeff[FP_LG_ACC] = + sfu_coeff * sample_perf_counters[FP_LG_ACC] / tot_sfu_accesses; + initpower_coeff[FP_SIN_ACC] = + sfu_coeff * sample_perf_counters[FP_SIN_ACC] / tot_sfu_accesses; + initpower_coeff[FP_EXP_ACC] = + sfu_coeff * sample_perf_counters[FP_EXP_ACC] / tot_sfu_accesses; + initpower_coeff[TENSOR_ACC] = + sfu_coeff * sample_perf_counters[TENSOR_ACC] / tot_sfu_accesses; + initpower_coeff[TEX_ACC] = + sfu_coeff * sample_perf_counters[TEX_ACC] / tot_sfu_accesses; + } else { + initpower_coeff[INT_MUL24_ACC] = 0; + initpower_coeff[INT_MUL32_ACC] = 0; + initpower_coeff[INT_MUL_ACC] = 0; + initpower_coeff[INT_DIV_ACC] = 0; + initpower_coeff[DP_MUL_ACC] = 0; + initpower_coeff[DP_DIV_ACC] = 0; + initpower_coeff[FP_MUL_ACC] = 0; + initpower_coeff[FP_DIV_ACC] = 0; + initpower_coeff[FP_SQRT_ACC] = 0; + initpower_coeff[FP_LG_ACC] = 0; + initpower_coeff[FP_SIN_ACC] = 0; + initpower_coeff[FP_EXP_ACC] = 0; + 
initpower_coeff[TENSOR_ACC] = 0; + initpower_coeff[TEX_ACC] = 0; + } + + effpower_coeff[INT_ACC] = initpower_coeff[INT_ACC]; + effpower_coeff[FP_ACC] = initpower_coeff[FP_ACC]; + effpower_coeff[DP_ACC] = initpower_coeff[DP_ACC]; + effpower_coeff[INT_MUL24_ACC] = initpower_coeff[INT_MUL24_ACC]; + effpower_coeff[INT_MUL32_ACC] = initpower_coeff[INT_MUL32_ACC]; + effpower_coeff[INT_MUL_ACC] = initpower_coeff[INT_MUL_ACC]; + effpower_coeff[INT_DIV_ACC] = initpower_coeff[INT_DIV_ACC]; + effpower_coeff[DP_MUL_ACC] = initpower_coeff[DP_MUL_ACC]; + effpower_coeff[DP_DIV_ACC] = initpower_coeff[DP_DIV_ACC]; + effpower_coeff[FP_MUL_ACC] = initpower_coeff[FP_MUL_ACC]; + effpower_coeff[FP_DIV_ACC] = initpower_coeff[FP_DIV_ACC]; + effpower_coeff[FP_SQRT_ACC] = initpower_coeff[FP_SQRT_ACC]; + effpower_coeff[FP_LG_ACC] = initpower_coeff[FP_LG_ACC]; + effpower_coeff[FP_SIN_ACC] = initpower_coeff[FP_SIN_ACC]; + effpower_coeff[FP_EXP_ACC] = initpower_coeff[FP_EXP_ACC]; + effpower_coeff[TENSOR_ACC] = initpower_coeff[TENSOR_ACC]; + effpower_coeff[TEX_ACC] = initpower_coeff[TEX_ACC]; initpower_coeff[NOC_A] = proc->get_coefficient_noc_accesses(); effpower_coeff[NOC_A] = initpower_coeff[NOC_A] * p->sys.scaling_coefficients[NOC_A]; - const_dynamic_power = - proc->get_const_dynamic_power() / (proc->cores[0]->executionTime); + // const_dynamic_power=proc->get_const_dynamic_power()/(proc->cores[0]->executionTime); for (unsigned i = 0; i < num_perf_counters; i++) { initpower_coeff[i] /= (proc->cores[0]->executionTime); @@ -607,11 +787,140 @@ void gpgpu_sim_wrapper::update_coefficients() { } } +double gpgpu_sim_wrapper::calculate_static_power() { + double int_accesses = + initpower_coeff[INT_ACC] + initpower_coeff[INT_MUL24_ACC] + + initpower_coeff[INT_MUL32_ACC] + initpower_coeff[INT_MUL_ACC] + + initpower_coeff[INT_DIV_ACC]; + double int_add_accesses = initpower_coeff[INT_ACC]; + double int_mul_accesses = + initpower_coeff[INT_MUL24_ACC] + initpower_coeff[INT_MUL32_ACC] + + initpower_coeff[INT_MUL_ACC] + initpower_coeff[INT_DIV_ACC]; + double fp_accesses = initpower_coeff[FP_ACC] + initpower_coeff[FP_MUL_ACC] + + initpower_coeff[FP_DIV_ACC]; + double dp_accesses = initpower_coeff[DP_ACC] + initpower_coeff[DP_MUL_ACC] + + initpower_coeff[DP_DIV_ACC]; + double sfu_accesses = + initpower_coeff[FP_SQRT_ACC] + initpower_coeff[FP_LG_ACC] + + initpower_coeff[FP_SIN_ACC] + initpower_coeff[FP_EXP_ACC]; + double tensor_accesses = initpower_coeff[TENSOR_ACC]; + double tex_accesses = initpower_coeff[TEX_ACC]; + double total_static_power = 0.0; + double base_static_power = 0.0; + double lane_static_power = 0.0; + double per_active_core = (num_cores - num_idle_cores) / num_cores; + + double l1_accesses = initpower_coeff[DC_RH] + initpower_coeff[DC_RM] + + initpower_coeff[DC_WH] + initpower_coeff[DC_WM]; + double l2_accesses = initpower_coeff[L2_RH] + initpower_coeff[L2_RM] + + initpower_coeff[L2_WH] + initpower_coeff[L2_WM]; + double shared_accesses = initpower_coeff[SHRD_ACC]; + + if (avg_threads_per_warp == + 0) { // no functional unit threads, check for memory or a 'LIGHT_SM' + if (l1_accesses != 0.0) + return (p->sys.static_l1_flane * per_active_core); + else if (shared_accesses != 0.0) + return (p->sys.static_shared_flane * per_active_core); + else if (l2_accesses != 0.0) + return (p->sys.static_l2_flane * per_active_core); + else // LIGHT_SM + return (p->sys.static_light_flane * + per_active_core); // return LIGHT_SM base static power + } + + /* using a linear model for thread divergence */ + if ((int_accesses != 0.0) && 
(fp_accesses != 0.0) && (dp_accesses != 0.0) && + (sfu_accesses == 0.0) && (tensor_accesses == 0.0) && + (tex_accesses == 0.0)) { + /* INT_FP_DP */ + base_static_power = p->sys.static_cat3_flane; + lane_static_power = p->sys.static_cat3_addlane; + } + + else if ((int_accesses != 0.0) && (fp_accesses != 0.0) && + (dp_accesses == 0.0) && (sfu_accesses == 0.0) && + (tensor_accesses != 0.0) && (tex_accesses == 0.0)) { + /* INT_FP_TENSOR */ + base_static_power = p->sys.static_cat6_flane; + lane_static_power = p->sys.static_cat6_addlane; + } + + else if ((int_accesses != 0.0) && (fp_accesses != 0.0) && + (dp_accesses == 0.0) && (sfu_accesses != 0.0) && + (tensor_accesses == 0.0) && (tex_accesses == 0.0)) { + /* INT_FP_SFU */ + base_static_power = p->sys.static_cat4_flane; + lane_static_power = p->sys.static_cat4_addlane; + } + + else if ((int_accesses != 0.0) && (fp_accesses != 0.0) && + (dp_accesses == 0.0) && (sfu_accesses == 0.0) && + (tensor_accesses == 0.0) && (tex_accesses != 0.0)) { + /* INT_FP_TEX */ + base_static_power = p->sys.static_cat5_flane; + lane_static_power = p->sys.static_cat5_addlane; + } + + else if ((int_accesses != 0.0) && (fp_accesses != 0.0) && + (dp_accesses == 0.0) && (sfu_accesses == 0.0) && + (tensor_accesses == 0.0) && (tex_accesses == 0.0)) { + /* INT_FP */ + base_static_power = p->sys.static_cat2_flane; + lane_static_power = p->sys.static_cat2_addlane; + } + + else if ((int_accesses != 0.0) && (fp_accesses == 0.0) && + (dp_accesses == 0.0) && (sfu_accesses == 0.0) && + (tensor_accesses == 0.0) && (tex_accesses == 0.0)) { + /* INT */ + /* Seperating INT_ADD only and INT_MUL only from mix of INT instructions */ + if ((int_add_accesses != 0.0) && (int_mul_accesses == 0.0)) { // INT_ADD + base_static_power = p->sys.static_intadd_flane; + lane_static_power = p->sys.static_intadd_addlane; + } else if ((int_add_accesses == 0.0) && + (int_mul_accesses != 0.0)) { // INT_MUL + base_static_power = p->sys.static_intmul_flane; + lane_static_power = p->sys.static_intmul_addlane; + } else { // INT_ADD+MUL + base_static_power = p->sys.static_cat1_flane; + lane_static_power = p->sys.static_cat1_addlane; + } + } + + else if ((int_accesses == 0.0) && (fp_accesses == 0.0) && + (dp_accesses == 0.0) && (sfu_accesses == 0.0) && + (tensor_accesses == 0.0) && (tex_accesses == 0.0)) { + /* LIGHT_SM or memory only sample */ + lane_static_power = + 0.0; // addlane static power is 0 for l1/l2/shared memory only accesses + if (l1_accesses != 0.0) + base_static_power = p->sys.static_l1_flane; + else if (shared_accesses != 0.0) + base_static_power = p->sys.static_shared_flane; + else if (l2_accesses != 0.0) + base_static_power = p->sys.static_l2_flane; + else { + base_static_power = p->sys.static_light_flane; + lane_static_power = p->sys.static_light_addlane; + } + } else { + base_static_power = + p->sys.static_geomean_flane; // GEOMEAN except LIGHT_SM if we don't + // fall into any of the categories above + lane_static_power = p->sys.static_geomean_addlane; + } + + total_static_power = + base_static_power + (((double)avg_threads_per_warp - 1.0) * + lane_static_power); // Linear Model + return (total_static_power * per_active_core); +} + void gpgpu_sim_wrapper::update_components_power() { update_coefficients(); proc_power = proc->rt_power.readOp.dynamic; - sample_cmp_pwr[IBP] = (proc->cores[0]->ifu->IB->rt_power.readOp.dynamic + proc->cores[0]->ifu->IB->rt_power.writeOp.dynamic + @@ -641,16 +950,71 @@ void gpgpu_sim_wrapper::update_components_power() { (proc->cores[0]->executionTime)) * 
(proc->cores[0]->exu->rf_fu_clockRate / proc->cores[0]->exu->clockRate); - sample_cmp_pwr[SPP] = + double sample_fp_pwr = (proc->cores[0]->exu->fp_u->rt_power.readOp.dynamic / + (proc->cores[0]->executionTime)); + + double sample_sfu_pwr = (proc->cores[0]->exu->mul->rt_power.readOp.dynamic / + (proc->cores[0]->executionTime)); + + sample_cmp_pwr[INTP] = (proc->cores[0]->exu->exeu->rt_power.readOp.dynamic / (proc->cores[0]->executionTime)) * (proc->cores[0]->exu->rf_fu_clockRate / proc->cores[0]->exu->clockRate); - sample_cmp_pwr[SFUP] = (proc->cores[0]->exu->mul->rt_power.readOp.dynamic / - (proc->cores[0]->executionTime)); - - sample_cmp_pwr[FPUP] = (proc->cores[0]->exu->fp_u->rt_power.readOp.dynamic / - (proc->cores[0]->executionTime)); + if (tot_fpu_accesses != 0) { + sample_cmp_pwr[FPUP] = + sample_fp_pwr * sample_perf_counters[FP_ACC] / tot_fpu_accesses; + sample_cmp_pwr[DPUP] = + sample_fp_pwr * sample_perf_counters[DP_ACC] / tot_fpu_accesses; + } else { + sample_cmp_pwr[FPUP] = 0; + sample_cmp_pwr[DPUP] = 0; + } + if (tot_sfu_accesses != 0) { + sample_cmp_pwr[INT_MUL24P] = + sample_sfu_pwr * sample_perf_counters[INT_MUL24_ACC] / tot_sfu_accesses; + sample_cmp_pwr[INT_MUL32P] = + sample_sfu_pwr * sample_perf_counters[INT_MUL32_ACC] / tot_sfu_accesses; + sample_cmp_pwr[INT_MULP] = + sample_sfu_pwr * sample_perf_counters[INT_MUL_ACC] / tot_sfu_accesses; + sample_cmp_pwr[INT_DIVP] = + sample_sfu_pwr * sample_perf_counters[INT_DIV_ACC] / tot_sfu_accesses; + sample_cmp_pwr[FP_MULP] = + sample_sfu_pwr * sample_perf_counters[FP_MUL_ACC] / tot_sfu_accesses; + sample_cmp_pwr[FP_DIVP] = + sample_sfu_pwr * sample_perf_counters[FP_DIV_ACC] / tot_sfu_accesses; + sample_cmp_pwr[FP_SQRTP] = + sample_sfu_pwr * sample_perf_counters[FP_SQRT_ACC] / tot_sfu_accesses; + sample_cmp_pwr[FP_LGP] = + sample_sfu_pwr * sample_perf_counters[FP_LG_ACC] / tot_sfu_accesses; + sample_cmp_pwr[FP_SINP] = + sample_sfu_pwr * sample_perf_counters[FP_SIN_ACC] / tot_sfu_accesses; + sample_cmp_pwr[FP_EXP] = + sample_sfu_pwr * sample_perf_counters[FP_EXP_ACC] / tot_sfu_accesses; + sample_cmp_pwr[DP_MULP] = + sample_sfu_pwr * sample_perf_counters[DP_MUL_ACC] / tot_sfu_accesses; + sample_cmp_pwr[DP_DIVP] = + sample_sfu_pwr * sample_perf_counters[DP_DIV_ACC] / tot_sfu_accesses; + sample_cmp_pwr[TENSORP] = + sample_sfu_pwr * sample_perf_counters[TENSOR_ACC] / tot_sfu_accesses; + sample_cmp_pwr[TEXP] = + sample_sfu_pwr * sample_perf_counters[TEX_ACC] / tot_sfu_accesses; + } else { + sample_cmp_pwr[INT_MUL24P] = 0; + sample_cmp_pwr[INT_MUL32P] = 0; + sample_cmp_pwr[INT_MULP] = 0; + sample_cmp_pwr[INT_DIVP] = 0; + sample_cmp_pwr[FP_MULP] = 0; + sample_cmp_pwr[FP_DIVP] = 0; + sample_cmp_pwr[FP_SQRTP] = 0; + sample_cmp_pwr[FP_LGP] = 0; + sample_cmp_pwr[FP_SINP] = 0; + sample_cmp_pwr[FP_EXP] = 0; + sample_cmp_pwr[DP_MULP] = 0; + sample_cmp_pwr[DP_DIVP] = 0; + sample_cmp_pwr[TENSORP] = 0; + sample_cmp_pwr[TEXP] = 0; + } sample_cmp_pwr[SCHEDP] = proc->cores[0]->exu->scheu->rt_power.readOp.dynamic / (proc->cores[0]->executionTime); @@ -678,26 +1042,50 @@ void gpgpu_sim_wrapper::update_components_power() { // This constant dynamic power (e.g., clock power) part is estimated via // regression model. - sample_cmp_pwr[CONST_DYNAMICP] = 0; - double cnst_dyn = - proc->get_const_dynamic_power() / (proc->cores[0]->executionTime); - // If the regression scaling term is greater than the recorded constant - // dynamic power then use the difference (other portion already added to - // dynamic power). 
Else, all the constant dynamic power is accounted for, add - // nothing. - if (p->sys.scaling_coefficients[CONST_DYNAMICN] > cnst_dyn) - sample_cmp_pwr[CONST_DYNAMICP] = - (p->sys.scaling_coefficients[CONST_DYNAMICN] - cnst_dyn); - - proc_power += sample_cmp_pwr[CONST_DYNAMICP]; - - double sum_pwr_cmp = 0; - for (unsigned i = 0; i < num_pwr_cmps; i++) { - sum_pwr_cmp += sample_cmp_pwr[i]; + sample_cmp_pwr[CONSTP] = 0; + sample_cmp_pwr[STATICP] = 0; + // double cnst_dyn = + // proc->get_const_dynamic_power()/(proc->cores[0]->executionTime); + // // If the regression scaling term is greater than the recorded constant + // dynamic power + // // then use the difference (other portion already added to dynamic power). + // Else, + // // all the constant dynamic power is accounted for, add nothing. + // if(p->sys.scaling_coefficients[constant_power] > cnst_dyn) + // sample_cmp_pwr[CONSTP] = + // (p->sys.scaling_coefficients[constant_power]-cnst_dyn); + sample_cmp_pwr[CONSTP] = p->sys.scaling_coefficients[constant_power]; + sample_cmp_pwr[STATICP] = calculate_static_power(); + + if (g_dvfs_enabled) { + double voltage_ratio = + modeled_chip_voltage / p->sys.modeled_chip_voltage_ref; + sample_cmp_pwr[IDLE_COREP] *= + voltage_ratio; // static power scaled by voltage_ratio + sample_cmp_pwr[STATICP] *= + voltage_ratio; // static power scaled by voltage_ratio + for (unsigned i = 0; i < num_pwr_cmps; i++) { + if ((i != IDLE_COREP) && (i != STATICP)) { + sample_cmp_pwr[i] *= + voltage_ratio * + voltage_ratio; // dynamic power scaled by square of voltage_ratio + } + } + } + + proc_power += sample_cmp_pwr[CONSTP] + sample_cmp_pwr[STATICP]; + if (!g_dvfs_enabled) { // sanity check will fail when voltage scaling is + // applied, fix later + double sum_pwr_cmp = 0; + for (unsigned i = 0; i < num_pwr_cmps; i++) { + sum_pwr_cmp += sample_cmp_pwr[i]; + } + bool check = false; + check = sanity_check(sum_pwr_cmp, proc_power); + if (!check) + printf("sum_pwr_cmp %f : proc_power %f \n", sum_pwr_cmp, proc_power); + assert("Total Power does not equal the sum of the components\n" && (check)); } - bool check = false; - check = sanity_check(sum_pwr_cmp, proc_power); - assert("Total Power does not equal the sum of the components\n" && (check)); } void gpgpu_sim_wrapper::compute() { proc->compute(); } @@ -722,6 +1110,15 @@ void gpgpu_sim_wrapper::print_power_kernel_stats( << std::endl; } + powerfile << "gpu_avg_threads_per_warp = " + << avg_threads_per_warp_tot / (double)kernel_sample_count + << std::endl; + + for (unsigned i = 0; i < num_perf_counters; ++i) { + powerfile << "gpu_tot_" << perf_count_label[i] << " = " + << kernel_cmp_perf_counters[i].avg << std::endl; + } + powerfile << std::endl << "Kernel Maximum Power Data:" << std::endl; powerfile << "kernel_max_power = " << kernel_power.max << std::endl; for (unsigned i = 0; i < num_pwr_cmps; ++i) { diff --git a/src/gpuwattch/gpgpu_sim_wrapper.h b/src/accelwattch/gpgpu_sim_wrapper.h similarity index 68% rename from src/gpuwattch/gpgpu_sim_wrapper.h rename to src/accelwattch/gpgpu_sim_wrapper.h index 00e4f0746..dd71d891f 100644 --- a/src/gpuwattch/gpgpu_sim_wrapper.h +++ b/src/accelwattch/gpgpu_sim_wrapper.h @@ -1,18 +1,20 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Tayler Hetherington, Ahmed ElTantawy, -// The University of British Columbia -// All rights reserved. +// Copyright (c) 2009-2021, Tor M. Aamodt, Tayler Hetherington, Ahmed ElTantawy, +// Vijay Kandiah, Nikos Hardavellas The University of British Columbia, +// Northwestern University All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, +// this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -54,9 +56,35 @@ struct avg_max_min_counters { } }; +#ifndef COEFF_STRUCT +#define COEFF_STRUCT + +struct PowerscalingCoefficients { + double int_coeff; + double int_mul_coeff; + double int_mul24_coeff; + double int_mul32_coeff; + double int_div_coeff; + double fp_coeff; + double dp_coeff; + double fp_mul_coeff; + double fp_div_coeff; + double dp_mul_coeff; + double dp_div_coeff; + double sqrt_coeff; + double log_coeff; + double sin_coeff; + double exp_coeff; + double tensor_coeff; + double tex_coeff; +}; + +#endif + class gpgpu_sim_wrapper { public: - gpgpu_sim_wrapper(bool power_simulation_enabled, char* xmlfile); + gpgpu_sim_wrapper(bool power_simulation_enabled, char* xmlfile, + int power_simulation_mode, bool dvfs_enabled); ~gpgpu_sim_wrapper(); void init_mcpat(char* xmlfile, char* powerfile, char* power_trace_file, @@ -64,7 +92,10 @@ class gpgpu_sim_wrapper { bool power_sim_enabled, bool trace_enabled, bool steady_state_enabled, bool power_per_cycle_dump, double steady_power_deviation, double steady_min_period, - int zlevel, double init_val, int stat_sample_freq); + int zlevel, double init_val, int stat_sample_freq, + int power_sim_mode, bool dvfs_enabled, unsigned clock_freq, + unsigned num_shaders); + void init_mcpat_hw_mode(unsigned gpu_sim_cycle); void detect_print_steady_state(int position, double init_val); void close_files(); void open_files(); @@ -72,6 +103,7 @@ class gpgpu_sim_wrapper { void dump(); void print_trace_files(); void update_components_power(); + double calculate_static_power(); void update_coefficients(); void reset_counters(); void print_power_kernel_stats(double gpu_sim_cycle, double gpu_tot_sim_cycle, @@ -79,6 +111,7 @@ class gpgpu_sim_wrapper { const std::string& kernel_info_string, bool print_trace); void power_metrics_calculations(); + void set_model_voltage(double model_voltage); void set_inst_power(bool clk_gated_lanes, double tot_cycles, double busy_cycles, double tot_inst, double int_inst, double fp_inst, double load_inst, double store_inst, @@ -92,16 +125,31 @@ class gpgpu_sim_wrapper { double 
write_accesses, double write_misses); void set_l2cache_power(double read_accesses, double read_misses, double write_accesses, double write_misses); + void set_num_cores(double num_core); void set_idle_core_power(double num_idle_core); void set_duty_cycle_power(double duty_cycle); void set_mem_ctrl_power(double reads, double writes, double dram_precharge); void set_exec_unit_power(double fpu_accesses, double ialu_accesses, double sfu_accesses); + void set_int_accesses(double ialu_accesses, double imul24_accesses, + double imul32_accesses, double imul_accesses, + double idiv_accesses); + void set_dp_accesses(double dpu_accesses, double dpmul_accesses, + double dpdiv_accesses); + void set_fp_accesses(double fpu_accesses, double fpmul_accesses, + double fpdiv_accesses); + void set_trans_accesses(double sqrt_accesses, double log_accesses, + double sin_accesses, double exp_accesses); + void set_tensor_accesses(double tensor_accesses); + void set_tex_accesses(double tex_accesses); + void set_avg_active_threads(float active_threads); void set_active_lanes_power(double sp_avg_active_lane, double sfu_avg_active_lane); - void set_NoC_power(double noc_tot_reads, double noc_tot_write); + void set_NoC_power(double noc_tot_acc); bool sanity_check(double a, double b); + PowerscalingCoefficients* get_scaling_coeffs(); + private: void print_steady_state(int position, double init_val); @@ -109,8 +157,10 @@ class gpgpu_sim_wrapper { ParseXML* p; // power parameters double const_dynamic_power; + double avg_threads_per_warp_tot; double proc_power; - + double num_cores; + double num_idle_cores; unsigned num_perf_counters; // # of performance counters unsigned num_pwr_cmps; // # of components modelled int kernel_sample_count; // # of samples per kernel @@ -140,6 +190,10 @@ class gpgpu_sim_wrapper { unsigned sample_start; double sample_val; double init_inst_val; + double tot_sfu_accesses; + double tot_fpu_accesses; + double modeled_chip_voltage; + unsigned avg_threads_per_warp; std::vector samples; std::vector samples_counter; std::vector pwr_counter; @@ -150,6 +204,8 @@ class gpgpu_sim_wrapper { char* g_metric_trace_filename; char* g_steady_state_tracking_filename; bool g_power_simulation_enabled; + int g_power_simulation_mode; + bool g_dvfs_enabled; bool g_steady_power_levels_enabled; bool g_power_trace_enabled; bool g_power_per_cycle_dump; diff --git a/src/gpuwattch/gpgpu_static.xml b/src/accelwattch/gpgpu_static.xml similarity index 100% rename from src/gpuwattch/gpgpu_static.xml rename to src/accelwattch/gpgpu_static.xml diff --git a/src/gpuwattch/interconnect.cc b/src/accelwattch/interconnect.cc similarity index 100% rename from src/gpuwattch/interconnect.cc rename to src/accelwattch/interconnect.cc diff --git a/src/gpuwattch/interconnect.h b/src/accelwattch/interconnect.h similarity index 100% rename from src/gpuwattch/interconnect.h rename to src/accelwattch/interconnect.h diff --git a/src/gpuwattch/iocontrollers.cc b/src/accelwattch/iocontrollers.cc similarity index 100% rename from src/gpuwattch/iocontrollers.cc rename to src/accelwattch/iocontrollers.cc diff --git a/src/gpuwattch/iocontrollers.h b/src/accelwattch/iocontrollers.h similarity index 100% rename from src/gpuwattch/iocontrollers.h rename to src/accelwattch/iocontrollers.h diff --git a/src/gpuwattch/logic.cc b/src/accelwattch/logic.cc similarity index 100% rename from src/gpuwattch/logic.cc rename to src/accelwattch/logic.cc diff --git a/src/gpuwattch/logic.h b/src/accelwattch/logic.h similarity index 100% rename from src/gpuwattch/logic.h 
rename to src/accelwattch/logic.h diff --git a/src/gpuwattch/main.cc b/src/accelwattch/main.cc similarity index 100% rename from src/gpuwattch/main.cc rename to src/accelwattch/main.cc diff --git a/src/gpuwattch/makefile b/src/accelwattch/makefile similarity index 100% rename from src/gpuwattch/makefile rename to src/accelwattch/makefile diff --git a/src/gpuwattch/mcpat.mk b/src/accelwattch/mcpat.mk similarity index 97% rename from src/gpuwattch/mcpat.mk rename to src/accelwattch/mcpat.mk index a09c23b4c..ad2d6c299 100644 --- a/src/gpuwattch/mcpat.mk +++ b/src/accelwattch/mcpat.mk @@ -1,5 +1,5 @@ -OUTPUT_DIR=$(SIM_OBJ_FILES_DIR)/gpuwattch +OUTPUT_DIR=$(SIM_OBJ_FILES_DIR)/accelwattch TARGET = mcpat SHELL = /bin/sh .PHONY: all depend clean diff --git a/src/gpuwattch/mcpatXeonCore.mk b/src/accelwattch/mcpatXeonCore.mk similarity index 100% rename from src/gpuwattch/mcpatXeonCore.mk rename to src/accelwattch/mcpatXeonCore.mk diff --git a/src/gpuwattch/memoryctrl.cc b/src/accelwattch/memoryctrl.cc similarity index 100% rename from src/gpuwattch/memoryctrl.cc rename to src/accelwattch/memoryctrl.cc diff --git a/src/gpuwattch/memoryctrl.h b/src/accelwattch/memoryctrl.h similarity index 100% rename from src/gpuwattch/memoryctrl.h rename to src/accelwattch/memoryctrl.h diff --git a/src/gpuwattch/noc.cc b/src/accelwattch/noc.cc similarity index 100% rename from src/gpuwattch/noc.cc rename to src/accelwattch/noc.cc diff --git a/src/gpuwattch/noc.h b/src/accelwattch/noc.h similarity index 100% rename from src/gpuwattch/noc.h rename to src/accelwattch/noc.h diff --git a/src/gpuwattch/processor.cc b/src/accelwattch/processor.cc similarity index 99% rename from src/gpuwattch/processor.cc rename to src/accelwattch/processor.cc index fc6db463d..d5c7cdda8 100644 --- a/src/gpuwattch/processor.cc +++ b/src/accelwattch/processor.cc @@ -30,11 +30,13 @@ ***************************************************************************/ /******************************************************************** * Modified by: - ** Jingwen Leng, Univeristy of Texas, Austin * Syed Gilani, - *University of Wisconsin–Madison * Tayler Hetherington, - *University of British Columbia * Ahmed ElTantawy, University of - *British Columbia * + * Jingwen Leng, University of Texas, Austin + * Syed Gilani, University of Wisconsin–Madison + * Tayler Hetherington, University of British Columbia + * Ahmed ElTantawy, University of British Columbia + * Vijay Kandiah, Northwestern University ********************************************************************/ + #include "processor.h" #include #include @@ -118,7 +120,7 @@ Processor::Processor(ParseXML *XML_interface) set_pppm(pppm_t, cores[i]->clockRate * procdynp.numCore, procdynp.numCore, procdynp.numCore, procdynp.numCore); // set the exClockRate - exClockRate = cores[0]->clockRate * 2; // TODO; get from XML file + exClockRate = cores[0]->clockRate; // TODO; get from XML file // cout<<"****EX clock rate:"<power * pppm_t; set_pppm(pppm_t, 1 / cores[i]->executionTime, procdynp.numCore, diff --git a/src/gpuwattch/processor.h b/src/accelwattch/processor.h similarity index 100% rename from src/gpuwattch/processor.h rename to src/accelwattch/processor.h diff --git a/src/gpuwattch/quadro.xml b/src/accelwattch/quadro.xml similarity index 100% rename from src/gpuwattch/quadro.xml rename to src/accelwattch/quadro.xml diff --git a/src/gpuwattch/results/Alpha21364 b/src/accelwattch/results/Alpha21364 similarity index 100% rename from src/gpuwattch/results/Alpha21364 rename to 
src/accelwattch/results/Alpha21364 diff --git a/src/gpuwattch/results/Alpha21364_90nm b/src/accelwattch/results/Alpha21364_90nm similarity index 100% rename from src/gpuwattch/results/Alpha21364_90nm rename to src/accelwattch/results/Alpha21364_90nm diff --git a/src/gpuwattch/results/Penryn b/src/accelwattch/results/Penryn similarity index 100% rename from src/gpuwattch/results/Penryn rename to src/accelwattch/results/Penryn diff --git a/src/gpuwattch/results/T1 b/src/accelwattch/results/T1 similarity index 100% rename from src/gpuwattch/results/T1 rename to src/accelwattch/results/T1 diff --git a/src/gpuwattch/results/T1_DC_64 b/src/accelwattch/results/T1_DC_64 similarity index 100% rename from src/gpuwattch/results/T1_DC_64 rename to src/accelwattch/results/T1_DC_64 diff --git a/src/gpuwattch/results/T1_SBT_64 b/src/accelwattch/results/T1_SBT_64 similarity index 100% rename from src/gpuwattch/results/T1_SBT_64 rename to src/accelwattch/results/T1_SBT_64 diff --git a/src/gpuwattch/results/T1_ST_64 b/src/accelwattch/results/T1_ST_64 similarity index 100% rename from src/gpuwattch/results/T1_ST_64 rename to src/accelwattch/results/T1_ST_64 diff --git a/src/gpuwattch/results/T2 b/src/accelwattch/results/T2 similarity index 100% rename from src/gpuwattch/results/T2 rename to src/accelwattch/results/T2 diff --git a/src/gpuwattch/results/Xeon_core b/src/accelwattch/results/Xeon_core similarity index 100% rename from src/gpuwattch/results/Xeon_core rename to src/accelwattch/results/Xeon_core diff --git a/src/gpuwattch/results/Xeon_uncore b/src/accelwattch/results/Xeon_uncore similarity index 100% rename from src/gpuwattch/results/Xeon_uncore rename to src/accelwattch/results/Xeon_uncore diff --git a/src/gpuwattch/sharedcache.cc b/src/accelwattch/sharedcache.cc similarity index 100% rename from src/gpuwattch/sharedcache.cc rename to src/accelwattch/sharedcache.cc diff --git a/src/gpuwattch/sharedcache.h b/src/accelwattch/sharedcache.h similarity index 100% rename from src/gpuwattch/sharedcache.h rename to src/accelwattch/sharedcache.h diff --git a/src/gpuwattch/technology_xeon_core.cc b/src/accelwattch/technology_xeon_core.cc similarity index 100% rename from src/gpuwattch/technology_xeon_core.cc rename to src/accelwattch/technology_xeon_core.cc diff --git a/src/gpuwattch/version.h b/src/accelwattch/version.h similarity index 100% rename from src/gpuwattch/version.h rename to src/accelwattch/version.h diff --git a/src/gpuwattch/xmlParser.cc b/src/accelwattch/xmlParser.cc similarity index 99% rename from src/gpuwattch/xmlParser.cc rename to src/accelwattch/xmlParser.cc index 8f49b3912..780d2ad04 100644 --- a/src/gpuwattch/xmlParser.cc +++ b/src/accelwattch/xmlParser.cc @@ -1236,12 +1236,10 @@ static NextToken GetNextToken(XML *pXML, int *pcbToken, // Indicate we are dealing with text *pType = eTokenText; while ((ch = getNextChar(pXML))) { - if - XML_isSPACECHAR(ch) { - indexStart++; - break; - } - else if (ch == _CXML('/')) { + if XML_isSPACECHAR (ch) { + indexStart++; + break; + } else if (ch == _CXML('/')) { // If we find a slash then this maybe text or a short hand end tag // Peek at the next character to see it we have short hand end tag ch = pXML->lpXML[pXML->nIndex]; @@ -2193,15 +2191,15 @@ int XMLNode::CreateXMLStringR(XMLNodeData *pEntry, XMLSTR lpszMarker, nResult++; } } else - // If there are child nodes we need to terminate the start tag - if (nElementI) { - if (lpszMarker) lpszMarker[nResult - 1] = _CXML('>'); - if (nFormat >= 0) { - if (lpszMarker) lpszMarker[nResult] = _CXML('\n'); - 
nResult++; - } - } else - nResult--; + // If there are child nodes we need to terminate the start tag + if (nElementI) { + if (lpszMarker) lpszMarker[nResult - 1] = _CXML('>'); + if (nFormat >= 0) { + if (lpszMarker) lpszMarker[nResult] = _CXML('\n'); + nResult++; + } + } else + nResult--; } // Calculate the child format for when we recurse. This is used to diff --git a/src/gpuwattch/xmlParser.h b/src/accelwattch/xmlParser.h similarity index 100% rename from src/gpuwattch/xmlParser.h rename to src/accelwattch/xmlParser.h diff --git a/src/cuda-sim/CMakeLists.txt b/src/cuda-sim/CMakeLists.txt new file mode 100644 index 000000000..3378b7743 --- /dev/null +++ b/src/cuda-sim/CMakeLists.txt @@ -0,0 +1,78 @@ +# Specify Flex and Bison target +BISON_TARGET(ptx_parser ptx.y ${CMAKE_CURRENT_BINARY_DIR}/ptx.tab.c + COMPILE_FLAGS "--name-prefix=ptx_ -v -d --file-prefix=${CMAKE_CURRENT_BINARY_DIR}/ptx") +BISON_TARGET(ptxinfo_parser ptxinfo.y ${CMAKE_CURRENT_BINARY_DIR}/ptxinfo.tab.c + COMPILE_FLAGS "--name-prefix=ptxinfo_ -v -d --file-prefix=${CMAKE_CURRENT_BINARY_DIR}/ptxinfo") +FLEX_TARGET(ptx_lexer ptx.l ${CMAKE_CURRENT_BINARY_DIR}/lex.ptx_.c) +FLEX_TARGET(ptxinfo_lexer ptxinfo.l ${CMAKE_CURRENT_BINARY_DIR}/lex.ptxinfo_.c) +ADD_FLEX_BISON_DEPENDENCY(ptx_lexer ptx_parser) +ADD_FLEX_BISON_DEPENDENCY(ptxinfo_lexer ptxinfo_parser) + +# The flex and bison are using CXX, need to set their generated files to CXX so that +# they can be compiled and linked +set_source_files_properties(${BISON_ptx_parser_OUTPUT_SOURCE} + ${FLEX_ptx_lexer_OUTPUTS} + ${BISON_ptxinfo_parser_OUTPUT_SOURCE} + ${FLEX_ptxinfo_lexer_OUTPUTS} + PROPERTIES LANGUAGE CXX) +# Create libptxsim.a +add_library(ptxsim STATIC + cuda_device_printf.cc + cuda_device_runtime.cc + cuda-sim.cc + instructions.cc + memory.cc + ptx_ir.cc + ptx_loader.cc + ptx_parser.cc + ptx_sim.cc + ptx-stats.cc + decuda_pred_table/decuda_pred_table.cc + ${BISON_ptx_parser_OUTPUT_SOURCE} ${FLEX_ptx_lexer_OUTPUTS} + ${BISON_ptxinfo_parser_OUTPUT_SOURCE} ${FLEX_ptxinfo_lexer_OUTPUTS}) + +# Define this for all source files, though we just need it for parser +target_compile_definitions(ptxsim PRIVATE YYDEBUG) +target_include_directories(ptxsim PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/decuda_pred_table) +target_include_directories(ptxsim PUBLIC ${CUDAToolkit_INCLUDE_DIRS}) +target_include_directories(ptxsim PRIVATE ${CMAKE_BINARY_DIR}) + +# ptxsim need buildstring +add_dependencies(ptxsim gen_build_string) + +# Create instructions.h using custom command +add_custom_target(gen_instructions_h DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/instructions.h) +add_custom_command( + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/instructions.h + COMMAND touch ${CMAKE_CURRENT_BINARY_DIR}/instructions.h + COMMAND chmod +w ${CMAKE_CURRENT_BINARY_DIR}/instructions.h + COMMAND echo "// DO NOT EDIT THIS FILE! 
IT IS AUTOMATICALLY GENERATED BY THE MAKEFILE (see target for instructions.h)" > ${CMAKE_CURRENT_BINARY_DIR}/instructions.h + COMMAND echo "#include \"ptx_ir.h\"" >> ${CMAKE_CURRENT_BINARY_DIR}/instructions.h + COMMAND echo "#ifndef instructions_h_included" >> ${CMAKE_CURRENT_BINARY_DIR}/instructions.h + COMMAND echo "#define instructions_h_included" >> ${CMAKE_CURRENT_BINARY_DIR}/instructions.h + COMMAND cat ${CMAKE_CURRENT_SOURCE_DIR}/instructions.cc | grep "_impl(" | sed "s/{.*//" | sed "s/$/;/" >> ${CMAKE_CURRENT_BINARY_DIR}/instructions.h + COMMAND echo "#endif" >> ${CMAKE_CURRENT_BINARY_DIR}/instructions.h + # COMMAND chmod -w ${CMAKE_CURRENT_BINARY_DIR}/instructions.h + DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/instructions.cc + VERBATIM +) +add_dependencies(ptxsim gen_instructions_h) + +# Create ptx_parser_decode.def using custom command +add_custom_target(gen_ptx_parser_decode DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/ptx_parser_decode.def) +if(UNIX) + add_custom_command( + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/ptx_parser_decode.def + COMMAND cat ${CMAKE_CURRENT_BINARY_DIR}/ptx.tab.h | grep "=" | sed "s/^[ ]\\+//" | sed -E "s/\\s+\\/\\*.+\\*\\///" | sed "s/[=,]//g" | sed "s/\\([_A-Z1-9]\\+\\)[ ]\\+\\([0-9]\\+\\)/\\1 \\1/" | sed "s/^/DEF(/" | sed "s/ /,\"/" | sed "s/$/\")/" | sed "/YYerror/d;/YYEOF/d;/YYEMPTY/d;/YYUNDEF/d;" > ${CMAKE_CURRENT_BINARY_DIR}/ptx_parser_decode.def + DEPENDS ${BISON_ptx_parser_OUTPUTS} + VERBATIM + ) +else() + add_custom_command( + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/ptx_parser_decode.def + COMMAND cat ${CMAKE_CURRENT_BINARY_DIR}/ptx.tab.h | grep "=" | sed -E "s/^ +//" | sed -E "s/\\s+\\/\\*.+\\*\\///" | sed "s/[=,]//g" | sed -E "s/([_A-Z1-9]+).*/\\1 \\1/" | sed "s/^/DEF(/" | sed "s/ /,\"/" | sed "s/$/\")/" | sed "/YYerror/d;/YYEOF/d;/YYEMPTY/d;/YYUNDEF/d;" > ${CMAKE_CURRENT_BINARY_DIR}/ptx_parser_decode.def + DEPENDS ${BISON_ptx_parser_OUTPUTS} + VERBATIM + ) +endif() +add_dependencies(ptxsim gen_ptx_parser_decode) diff --git a/src/cuda-sim/Makefile b/src/cuda-sim/Makefile index 85d1c8c01..541cf8f26 100644 --- a/src/cuda-sim/Makefile +++ b/src/cuda-sim/Makefile @@ -91,16 +91,16 @@ $(OUTPUT_DIR)/lex.ptxinfo_.o: $(OUTPUT_DIR)/lex.ptxinfo_.c $(OUTPUT_DIR)/ptxinfo $(CPP) -c $(CXX_OPT) $(OUTPUT_DIR)/lex.ptxinfo_.c -o $(OUTPUT_DIR)/lex.ptxinfo_.o $(OUTPUT_DIR)/ptx.tab.c: ptx.y - bison --name-prefix=ptx_ -v -d ptx.y --file-prefix=$(OUTPUT_DIR)/ptx + bison --name-prefix=ptx_ -v -d ptx.y --file-prefix=$(OUTPUT_DIR)/ptx 2> /dev/null $(OUTPUT_DIR)/ptxinfo.tab.c: ptxinfo.y - bison --name-prefix=ptxinfo_ -v -d ptxinfo.y --file-prefix=$(OUTPUT_DIR)/ptxinfo + bison --name-prefix=ptxinfo_ -v -d ptxinfo.y --file-prefix=$(OUTPUT_DIR)/ptxinfo 2> /dev/null $(OUTPUT_DIR)/lex.ptx_.c: ptx.l - flex --outfile=$(OUTPUT_DIR)/lex.ptx_.c ptx.l + flex --outfile=$(OUTPUT_DIR)/lex.ptx_.c ptx.l 2> /dev/null $(OUTPUT_DIR)/lex.ptxinfo_.c: ptxinfo.l - flex --outfile=$(OUTPUT_DIR)/lex.ptxinfo_.c ptxinfo.l + flex --outfile=$(OUTPUT_DIR)/lex.ptxinfo_.c ptxinfo.l 2> /dev/null clean: rm -f *~ *.o *.gcda *.gcno *.gcov libgpgpu_ptx_sim.a \ @@ -129,9 +129,9 @@ $(OUTPUT_DIR)/instructions.h: instructions.cc $(OUTPUT_DIR)/ptx_parser_decode.def: $(OUTPUT_DIR)/ptx.tab.c ifeq ($(shell uname),Linux) - cat $(OUTPUT_DIR)/ptx.tab.h | grep "=" | sed 's/^[ ]\+//' | sed 's/[=,]//g' | sed 's/\([_A-Z1-9]\+\)[ ]\+\([0-9]\+\)/\1 \1/' | sed 's/^/DEF(/' | sed 's/ /,"/' | sed 's/$$/")/' > $(OUTPUT_DIR)/ptx_parser_decode.def + cat $(OUTPUT_DIR)/ptx.tab.h | grep "=" | sed 's/^[ ]\+//' | sed -E 's/\s+\/\*.+\*\///' | sed 's/[=,]//g' | 
sed 's/\([_A-Z1-9]\+\)[ ]\+\([0-9]\+\)/\1 \1/' | sed 's/^/DEF(/' | sed 's/ /,"/' | sed 's/$$/")/' | sed '/YYerror/d;/YYEOF/d;/YYEMPTY/d;/YYUNDEF/d;'> $(OUTPUT_DIR)/ptx_parser_decode.def else - cat $(OUTPUT_DIR)/ptx.tab.h | grep "=" | sed -E 's/^ +//' | sed 's/[=,]//g' | sed -E 's/([_A-Z1-9]+).*/\1 \1/' | sed 's/^/DEF(/' | sed 's/ /,"/' | sed 's/$$/")/' > $(OUTPUT_DIR)/ptx_parser_decode.def + cat $(OUTPUT_DIR)/ptx.tab.h | grep "=" | sed -E 's/^ +//' | sed -E 's/\s+\/\*.+\*\///' | sed 's/[=,]//g' | sed -E 's/([_A-Z1-9]+).*/\1 \1/' | sed 's/^/DEF(/' | sed 's/ /,"/' | sed 's/$$/")/' | sed '/YYerror/d;/YYEOF/d;/YYEMPTY/d;/YYUNDEF/d;' > $(OUTPUT_DIR)/ptx_parser_decode.def endif $(OUTPUT_DIR)/instructions.o: $(OUTPUT_DIR)/instructions.h $(OUTPUT_DIR)/ptx.tab.c diff --git a/src/cuda-sim/cuda-sim.cc b/src/cuda-sim/cuda-sim.cc index 71f0703ac..2fd90c0e5 100644 --- a/src/cuda-sim/cuda-sim.cc +++ b/src/cuda-sim/cuda-sim.cc @@ -1,19 +1,22 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Ali Bakhoda, Wilson W.L. Fung, -// George L. Yuan, Jimmy Kwa -// The University of British Columbia -// All rights reserved. +// Copyright (c) 2009-2021, Tor M. Aamodt, Ali Bakhoda, Wilson W.L. Fung, +// George L. Yuan, Jimmy Kwa, Vijay Kandiah, Nikos Hardavellas, +// Mahmoud Khairy, Junrui Pan, Timothy G. Rogers +// The University of British Columbia, Northwestern University, Purdue +// University All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, +// this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. 
// // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -543,7 +546,7 @@ void gpgpu_t::gpu_memset(size_t dst_start_addr, int c, size_t count) { void cuda_sim::ptx_print_insn(address_type pc, FILE *fp) { std::map::iterator f = g_pc_to_finfo.find(pc); if (f == g_pc_to_finfo.end()) { - fprintf(fp, "", pc); + fprintf(fp, "", pc); return; } function_info *finfo = f->second; @@ -557,7 +560,7 @@ std::string cuda_sim::ptx_get_insn_str(address_type pc) { #define STR_SIZE 255 char buff[STR_SIZE]; buff[STR_SIZE - 1] = '\0'; - snprintf(buff, STR_SIZE, "", pc); + snprintf(buff, STR_SIZE, "", pc); return std::string(buff); } function_info *finfo = f->second; @@ -588,21 +591,26 @@ void ptx_instruction::set_fp_or_int_archop() { oprnd_type = INT_OP; } } + void ptx_instruction::set_mul_div_or_other_archop() { sp_op = OTHER_OP; if ((m_opcode != MEMBAR_OP) && (m_opcode != SSY_OP) && (m_opcode != BRA_OP) && (m_opcode != BAR_OP) && (m_opcode != EXIT_OP) && (m_opcode != NOP_OP) && (m_opcode != RETP_OP) && (m_opcode != RET_OP) && (m_opcode != CALLP_OP) && (m_opcode != CALL_OP)) { - if (get_type() == F32_TYPE || get_type() == F64_TYPE || - get_type() == FF64_TYPE) { + if (get_type() == F64_TYPE || get_type() == FF64_TYPE) { switch (get_opcode()) { case MUL_OP: case MAD_OP: - sp_op = FP_MUL_OP; + case FMA_OP: + sp_op = DP_MUL_OP; break; case DIV_OP: - sp_op = FP_DIV_OP; + case REM_OP: + sp_op = DP_DIV_OP; + break; + case RCP_OP: + sp_op = DP_DIV_OP; break; case LG2_OP: sp_op = FP_LG_OP; @@ -611,9 +619,44 @@ void ptx_instruction::set_mul_div_or_other_archop() { case SQRT_OP: sp_op = FP_SQRT_OP; break; + case SIN_OP: + case COS_OP: + sp_op = FP_SIN_OP; + break; + case EX2_OP: + sp_op = FP_EXP_OP; + break; + case MMA_OP: + sp_op = TENSOR__OP; + break; + case TEX_OP: + sp_op = TEX__OP; + break; + default: + if ((op == DP_OP) || (op == ALU_OP)) sp_op = DP___OP; + break; + } + } else if (get_type() == F16_TYPE || get_type() == F32_TYPE) { + switch (get_opcode()) { + case MUL_OP: + case MAD_OP: + case FMA_OP: + sp_op = FP_MUL_OP; + break; + case DIV_OP: + case REM_OP: + sp_op = FP_DIV_OP; + break; case RCP_OP: sp_op = FP_DIV_OP; break; + case LG2_OP: + sp_op = FP_LG_OP; + break; + case RSQRT_OP: + case SQRT_OP: + sp_op = FP_SQRT_OP; + break; case SIN_OP: case COS_OP: sp_op = FP_SIN_OP; @@ -621,8 +664,14 @@ void ptx_instruction::set_mul_div_or_other_archop() { case EX2_OP: sp_op = FP_EXP_OP; break; + case MMA_OP: + sp_op = TENSOR__OP; + break; + case TEX_OP: + sp_op = TEX__OP; + break; default: - if ((op == ALU_OP) || (op == TENSOR_CORE_OP)) sp_op = FP__OP; + if ((op == SP_OP) || (op == ALU_OP)) sp_op = FP__OP; break; } } else { @@ -633,6 +682,7 @@ void ptx_instruction::set_mul_div_or_other_archop() { break; case MUL_OP: case MAD_OP: + case FMA_OP: if (get_type() == U32_TYPE || get_type() == S32_TYPE || get_type() == B32_TYPE) sp_op = INT_MUL32_OP; @@ -640,10 +690,17 @@ void ptx_instruction::set_mul_div_or_other_archop() { sp_op = INT_MUL_OP; break; case DIV_OP: + case REM_OP: sp_op = INT_DIV_OP; break; + case MMA_OP: + sp_op = TENSOR__OP; + break; + case TEX_OP: + sp_op = TEX__OP; + break; default: - if ((op == ALU_OP)) sp_op = INT__OP; + if ((op == INTP_OP) || (op == ALU_OP)) sp_op = INT__OP; break; } } @@ -880,6 +937,7 @@ void ptx_instruction::set_opcode_and_latency() { case MAD_OP: case MADC_OP: case MADP_OP: + case FMA_OP: // MAD latency switch (get_type()) { case F32_TYPE: @@ -903,7 +961,20 @@ void 
ptx_instruction::set_opcode_and_latency() { break; } break; + case MUL24_OP: // MUL24 is performed on mul32 units (with additional + // instructions for bitmasking) on devices with compute + // capability >1.x + latency = int_latency[2] + 1; + initiation_interval = int_init[2] + 1; + op = INTP_OP; + break; + case MAD24_OP: + latency = int_latency[3] + 1; + initiation_interval = int_init[3] + 1; + op = INTP_OP; + break; case DIV_OP: + case REM_OP: // Floating point only op = SFU_OP; switch (get_type()) { @@ -1234,7 +1305,12 @@ void function_info::add_param_name_type_size(unsigned index, std::string name, void function_info::add_param_data(unsigned argn, struct gpgpu_ptx_sim_arg *args) { const void *data = args->m_start; - + if (g_debug_execution >= 3) { + if (args->m_nbytes == 4) + printf("ADD_PARAM_DATA %d\n", *((uint32_t *)data)); + else + printf("ADD_PARAM_DATA %p\n", *((void **)data)); + } bool scratchpad_memory_param = false; // Is this parameter in CUDA shared memory or OpenCL local memory @@ -1304,7 +1380,7 @@ void function_info::add_param_data(unsigned argn, unsigned num_bits = 8 * args->m_nbytes; printf( "GPGPU-Sim PTX: deferred allocation of shared region for \"%s\" from " - "0x%x to 0x%x (shared memory space)\n", + "0x%llx to 0x%llx (shared memory space)\n", p->name().c_str(), m_symtab->get_shared_next(), m_symtab->get_shared_next() + num_bits / 8); fflush(stdout); @@ -1435,7 +1511,7 @@ void function_info::list_param(FILE *fout) const { std::string name = p.get_name(); symbol *param = m_symtab->lookup(name.c_str()); addr_t param_addr = param->get_address(); - fprintf(fout, "%s: %#08x\n", name.c_str(), param_addr); + fprintf(fout, "%s: %#08llx\n", name.c_str(), param_addr); } fflush(fout); } @@ -1463,9 +1539,13 @@ void function_info::ptx_jit_config( std::string filename_c(filename + "_c"); snprintf(buff, 1024, "c++filt %s > %s", get_name().c_str(), filename_c.c_str()); - assert(system(buff) != NULL); + assert(system(buff) != 0); FILE *fp = fopen(filename_c.c_str(), "r"); - fgets(buff, 1024, fp); + char *ptr = fgets(buff, 1024, fp); + if (ptr == NULL) { + printf("can't read file %s \n", filename_c.c_str()); + assert(0); + } fclose(fp); std::string fn(buff); size_t pos1, pos2; @@ -1671,6 +1751,17 @@ static unsigned get_tex_datasize(const ptx_instruction *pI, ptx_thread_info *thread) { const operand_info &src1 = pI->src1(); // the name of the texture std::string texname = src1.name(); + // If indirect access, use register's value as address + // to find the symbol + if (src1.is_reg()) { + const operand_info &dst = pI->dst(); + ptx_reg_t src1_data = + thread->get_operand_value(src1, dst, pI->get_type(), thread, 1); + addr_t sym_addr = src1_data.u64; + symbol *texRef = thread->get_symbol_table()->lookup_by_addr(sym_addr); + assert(texRef != NULL); + texname = texRef->name(); + } /* For programs with many streams, textures can be bound and unbound @@ -1809,7 +1900,7 @@ void ptx_thread_info::ptx_exec_inst(warp_inst_t &inst, unsigned lane_id) { dim3 tid = get_tid(); printf( "%u [thd=%u][i=%u] : ctaid=(%u,%u,%u) tid=(%u,%u,%u) icount=%u " - "[pc=%u] (%s:%u - %s) [0x%llx]\n", + "[pc=%llu] (%s:%u - %s) [0x%llx]\n", m_gpu->gpgpu_ctx->func_sim->g_ptx_sim_num_insn, get_uid(), pI->uid(), ctaid.x, ctaid.y, ctaid.z, tid.x, tid.y, tid.z, get_icount(), pc, pI->source_file(), pI->source_line(), pI->get_source(), @@ -2210,15 +2301,24 @@ void cuda_sim::gpgpu_ptx_sim_memcpy_symbol(const char *hostVar, const void *src, sym_name = g->second; mem_region = global_space; } - if (g_globals.find(hostVar) != 
g_globals.end()) { - found_sym = true; - sym_name = hostVar; - mem_region = global_space; - } - if (g_constants.find(hostVar) != g_constants.end()) { - found_sym = true; - sym_name = hostVar; - mem_region = const_space; + + // Weili: Only attempt to find symbol as it is a string + // if we could not find it in previously registered variable. + // This will avoid constructing std::string() from hostVar address + // where it is not a string as + // Use of a string naming a variable as the symbol parameter was deprecated in + // CUDA 4.1 and removed in CUDA 5.0. + if (!found_sym) { + if (g_globals.find(hostVar) != g_globals.end()) { + found_sym = true; + sym_name = hostVar; + mem_region = global_space; + } + if (g_constants.find(hostVar) != g_constants.end()) { + found_sym = true; + sym_name = hostVar; + mem_region = const_space; + } } if (!found_sym) { @@ -2308,7 +2408,7 @@ void cuda_sim::read_sim_environment_variables() { "%s\n", dbg_pc); fflush(stdout); - sscanf(dbg_pc, "%d", &g_debug_pc); + sscanf(dbg_pc, "%llu", &g_debug_pc); } #if CUDART_VERSION > 1010 diff --git a/src/cuda-sim/cuda_device_runtime.cc b/src/cuda-sim/cuda_device_runtime.cc index 4a99c1cbb..8ed90bcc2 100644 --- a/src/cuda-sim/cuda_device_runtime.cc +++ b/src/cuda-sim/cuda_device_runtime.cc @@ -36,7 +36,7 @@ void cuda_device_runtime::gpgpusim_cuda_getParameterBufferV2( unsigned n_args = target_func->num_args(); assert(n_args == 4); - function_info *child_kernel_entry; + function_info *child_kernel_entry = NULL; struct dim3 grid_dim, block_dim; unsigned int shared_mem; @@ -258,7 +258,7 @@ void cuda_device_runtime::gpgpusim_cuda_streamCreateWithFlags( assert(n_args == 2); size_t generic_pStream_addr; - addr_t pStream_addr; + addr_t pStream_addr = 0; unsigned int flags; for (unsigned arg = 0; arg < n_args; arg++) { const operand_info &actual_param_op = diff --git a/src/cuda-sim/instructions.cc b/src/cuda-sim/instructions.cc index 8936fa80e..843bf0ba7 100644 --- a/src/cuda-sim/instructions.cc +++ b/src/cuda-sim/instructions.cc @@ -1,19 +1,22 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Wilson W.L. Fung, Ali Bakhoda, -// Jimmy Kwa, George L. Yuan -// The University of British Columbia -// All rights reserved. +// Copyright (c) 2009-2021, Tor M. Aamodt, Wilson W.L. Fung, Ali Bakhoda, +// Jimmy Kwa, George L. Yuan, Vijay Kandiah, Nikos Hardavellas, +// Mahmoud Khairy, Junrui Pan, Timothy G. Rogers +// The University of British Columbia, Northwestern University, Purdue +// University All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, +// this +// list of conditions and the following disclaimer; +// 2. 
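The guard added to gpgpu_ptx_sim_memcpy_symbol above encodes a lookup order: consult the tables keyed by the registered hostVar address first, and only fall back to reading hostVar as a C string when that fails, since string-named symbols were deprecated in CUDA 4.1 and removed in CUDA 5.0, meaning hostVar is normally a raw address that must not be handed to std::string(). A minimal sketch of that order, with hypothetical container names:

#include <map>
#include <set>
#include <string>

// Hypothetical stand-ins for the simulator's registration tables.
static std::map<const void *, std::string> registered_vars;  // addr -> name
static std::set<std::string> legacy_globals;  // string-named globals

bool resolve_symbol(const void *hostVar, std::string &sym_name) {
  // Preferred path: hostVar was registered by address.
  auto it = registered_vars.find(hostVar);
  if (it != registered_vars.end()) {
    sym_name = it->second;
    return true;
  }
  // Legacy fallback: only now is it plausible that hostVar points at a
  // C string (pre-CUDA-5.0 usage); doing this unconditionally would read
  // through an arbitrary device-variable address.
  std::string name(static_cast<const char *>(hostVar));
  if (legacy_globals.count(name)) {
    sym_name = name;
    return true;
  }
  return false;
}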
Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -26,6 +29,7 @@ // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE. + #include "instructions.h" #include "half.h" #include "half.hpp" @@ -166,8 +170,9 @@ void inst_not_implemented(const ptx_instruction *pI); ptx_reg_t srcOperandModifiers(ptx_reg_t opData, operand_info opInfo, operand_info dstInfo, unsigned type, ptx_thread_info *thread); - -void video_mem_instruction(const ptx_instruction *pI, ptx_thread_info *thread, int op_code); + +void video_mem_instruction(const ptx_instruction *pI, ptx_thread_info *thread, + int op_code); void sign_extend(ptx_reg_t &data, unsigned src_size, const operand_info &dst); @@ -1711,40 +1716,50 @@ void bfi_impl(const ptx_instruction *pI, ptx_thread_info *thread) { } thread->set_operand_value(dst, data, i_type, thread, pI); } -void bfind_impl(const ptx_instruction *pI, ptx_thread_info *thread) -{ - const operand_info &dst = pI->dst(); +void bfind_impl(const ptx_instruction *pI, ptx_thread_info *thread) { + const operand_info &dst = pI->dst(); const operand_info &src1 = pI->src1(); const unsigned i_type = pI->get_type(); - const ptx_reg_t src1_data = thread->get_operand_value(src1, dst, i_type, thread, 1); - const int msb = ( i_type == U32_TYPE || i_type == S32_TYPE) ? 31 : 63; + const ptx_reg_t src1_data = + thread->get_operand_value(src1, dst, i_type, thread, 1); + const int msb = (i_type == U32_TYPE || i_type == S32_TYPE) ? 
31 : 63; unsigned long a = 0; - switch (i_type) - { - case S32_TYPE: a = src1_data.s32; break; - case U32_TYPE: a = src1_data.u32; break; - case S64_TYPE: a = src1_data.s64; break; - case U64_TYPE: a = src1_data.u64; break; - default: assert(false); abort(); + switch (i_type) { + case S32_TYPE: + a = src1_data.s32; + break; + case U32_TYPE: + a = src1_data.u32; + break; + case S64_TYPE: + a = src1_data.s64; + break; + case U64_TYPE: + a = src1_data.u64; + break; + default: + assert(false); + abort(); } // negate negative signed inputs - if ( ( i_type == S32_TYPE || i_type == S64_TYPE ) && ( a & ( 1 << msb ) ) ) { - a = ~a; + if ((i_type == S32_TYPE || i_type == S64_TYPE) && (a & (1 << msb))) { + a = ~a; } uint32_t d_data = 0xffffffff; for (uint32_t i = msb; i >= 0; i--) { - if (a & (1<set_operand_value(dst, d_data, U32_TYPE, thread, pI); - - } void bra_impl(const ptx_instruction *pI, ptx_thread_info *thread) { @@ -1934,7 +1949,7 @@ void mma_impl(const ptx_instruction *pI, core_t *core, warp_inst_t inst) { hex_val = (v[k / 2].s64 & 0xffff); else hex_val = ((v[k / 2].s64 & 0xffff0000) >> 16); - nw_v[k].f16 = *((half *)&hex_val); + nw_v[k].f16 = *(reinterpret_cast(hex_val)); } } if (!((operand_num == 3) && (type2 == F32_TYPE))) { @@ -3966,7 +3981,7 @@ void mad_def(const ptx_instruction *pI, ptx_thread_info *thread, fesetround(FE_TOWARDZERO); break; default: - assert(0); + // assert(0); break; } d.f32 = a.f32 * b.f32 + c.f32; @@ -4312,11 +4327,8 @@ void mul_impl(const ptx_instruction *pI, ptx_thread_info *thread) { case S64_TYPE: t.s64 = a.s64 * b.s64; assert(!pI->is_wide()); - assert(!pI->is_hi()); - if (pI->is_lo()) - d.s64 = t.s64; - else - assert(0); + // assert(!pI->is_hi()); + d.s64 = t.s64; break; case U16_TYPE: t.u32 = ((unsigned)a.u16) * ((unsigned)b.u16); @@ -5429,6 +5441,38 @@ void shfl_impl(const ptx_instruction *pI, core_t *core, warp_inst_t inst) { } } +void shf_impl(const ptx_instruction *pI, ptx_thread_info *thread) { + ptx_reg_t a, b, c, d; + const operand_info &dst = pI->dst(); + const operand_info &src1 = pI->src1(); + const operand_info &src2 = pI->src2(); + const operand_info &src3 = pI->src3(); + + // Only b32 is allowed + unsigned i_type = pI->get_type(); + a = thread->get_operand_value(src1, dst, i_type, thread, 1); + b = thread->get_operand_value(src2, dst, i_type, thread, 1); + c = thread->get_operand_value(src3, dst, i_type, thread, 1); + + if (i_type != B32_TYPE) + printf("Only the b32 data_type is allowed per the ISA\n"); + + unsigned clamp_mode = pI->clamp_mode(); + unsigned n = c.u32 & 0x1f; + if (clamp_mode) { + if (c.u32 < 32) + n = c; + else + n = 32; + } + if (pI->left_mode()) + d.u32 = (b.u32 << n) | (a.u32 >> (32 - n)); + else + d.u32 = (b.u32 << (32 - n)) | (a.u32 >> n); + + thread->set_operand_value(dst, d, i_type, thread, pI); +} + void shl_impl(const ptx_instruction *pI, ptx_thread_info *thread) { ptx_reg_t a, b, d; const operand_info &dst = pI->dst(); @@ -6011,6 +6055,17 @@ void tex_impl(const ptx_instruction *pI, ptx_thread_info *thread) { // to be fetched std::string texname = src1.name(); + // If indirect access, use register's value as address + // to find the symbol + if (src1.is_reg()) { + ptx_reg_t src1_data = + thread->get_operand_value(src1, dst, pI->get_type(), thread, 1); + addr_t sym_addr = src1_data.u64; + symbol *texRef = thread->get_symbol_table()->lookup_by_addr(sym_addr); + assert(texRef != NULL); + texname = texRef->name(); + } + unsigned to_type = pI->get_type(); unsigned c_type = pI->get_type2(); fflush(stdout); @@ -6339,12 +6394,10 
@@ void vmad_impl(const ptx_instruction *pI, ptx_thread_info *thread) { #define VMAX 0 #define VMIN 1 -void vmax_impl(const ptx_instruction *pI, ptx_thread_info *thread) -{ - video_mem_instruction(pI, thread, VMAX); +void vmax_impl(const ptx_instruction *pI, ptx_thread_info *thread) { + video_mem_instruction(pI, thread, VMAX); } -void vmin_impl(const ptx_instruction *pI, ptx_thread_info *thread) -{ +void vmin_impl(const ptx_instruction *pI, ptx_thread_info *thread) { video_mem_instruction(pI, thread, VMIN); } void vset_impl(const ptx_instruction *pI, ptx_thread_info *thread) { @@ -6440,12 +6493,12 @@ void vote_impl(const ptx_instruction *pI, ptx_thread_info *thread) { } } -void activemask_impl( const ptx_instruction *pI, ptx_thread_info *thread ) -{ +void activemask_impl(const ptx_instruction *pI, ptx_thread_info *thread) { active_mask_t l_activemask_bitset = pI->get_warp_active_mask(); - uint32_t l_activemask_uint = static_cast(l_activemask_bitset.to_ulong()); + uint32_t l_activemask_uint = + static_cast(l_activemask_bitset.to_ulong()); - const operand_info &dst = pI->dst(); + const operand_info &dst = pI->dst(); thread->set_operand_value(dst, l_activemask_uint, U32_TYPE, thread, pI); } @@ -6527,12 +6580,12 @@ ptx_reg_t srcOperandModifiers(ptx_reg_t opData, operand_info opInfo, return result; } -void video_mem_instruction(const ptx_instruction *pI, ptx_thread_info *thread, int op_code) -{ - const operand_info &dst = pI->dst(); // d - const operand_info &src1 = pI->src1(); // a - const operand_info &src2 = pI->src2(); // b - const operand_info &src3 = pI->src3(); // c +void video_mem_instruction(const ptx_instruction *pI, ptx_thread_info *thread, + int op_code) { + const operand_info &dst = pI->dst(); // d + const operand_info &src1 = pI->src1(); // a + const operand_info &src2 = pI->src2(); // b + const operand_info &src3 = pI->src3(); // c const unsigned i_type = pI->get_type(); @@ -6557,19 +6610,18 @@ void video_mem_instruction(const ptx_instruction *pI, ptx_thread_info *thread, i auto option = options.begin(); assert(*option == ATOMIC_MAX || *option == ATOMIC_MIN); - switch ( i_type ) { + switch (i_type) { case S32_TYPE: { // assert all operands are S32_TYPE: scalar_type = pI->get_scalar_type(); - for (std::list::iterator scalar = scalar_type.begin(); scalar != scalar_type.end(); scalar++) - { + for (std::list::iterator scalar = scalar_type.begin(); + scalar != scalar_type.end(); scalar++) { assert(*scalar == S32_TYPE); } assert(scalar_type.size() == 3); scalar_type.clear(); - switch (op_code) - { + switch (op_code) { case VMAX: data.s32 = MY_MAX_I(ta.s32, tb.s32); break; @@ -6580,26 +6632,23 @@ void video_mem_instruction(const ptx_instruction *pI, ptx_thread_info *thread, i assert(0); } - switch (*option) - { + switch (*option) { case ATOMIC_MAX: data.s32 = MY_MAX_I(data.s32, c.s32); - break; + break; case ATOMIC_MIN: data.s32 = MY_MIN_I(data.s32, c.s32); - break; + break; default: - assert(0); // not yet implemented + assert(0); // not yet implemented } break; - } default: - assert(0); // not yet implemented + assert(0); // not yet implemented } thread->set_operand_value(dst, data, i_type, thread, pI); return; } - diff --git a/src/cuda-sim/memory.cc b/src/cuda-sim/memory.cc index 132383780..036badaf1 100644 --- a/src/cuda-sim/memory.cc +++ b/src/cuda-sim/memory.cc @@ -109,11 +109,11 @@ void memory_space_impl::read_single_block(mem_addr_t blk_idx, if ((addr + length) > (blk_idx + 1) * BSIZE) { printf( "GPGPU-Sim PTX: ERROR * access to memory \'%s\' is unaligned : " - "addr=0x%x, 
length=%zu\n", + "addr=0x%llx, length=%zu\n", m_name.c_str(), addr, length); printf( - "GPGPU-Sim PTX: (addr+length)=0x%lx > 0x%x=(index+1)*BSIZE, " - "index=0x%x, BSIZE=0x%x\n", + "GPGPU-Sim PTX: (addr+length)=0x%llx > 0x%llx=(index+1)*BSIZE, " + "index=0x%llx, BSIZE=0x%x\n", (addr + length), (blk_idx + 1) * BSIZE, blk_idx, BSIZE); throw 1; } @@ -169,7 +169,7 @@ void memory_space_impl::print(const char *format, FILE *fout) const { typename map_t::const_iterator i_page; for (i_page = m_data.begin(); i_page != m_data.end(); ++i_page) { - fprintf(fout, "%s %08x:", m_name.c_str(), i_page->first); + fprintf(fout, "%s %08llx:", m_name.c_str(), i_page->first); i_page->second.print(format, fout); } } diff --git a/src/cuda-sim/opcodes.def b/src/cuda-sim/opcodes.def index f5bf156e2..83a23ea77 100644 --- a/src/cuda-sim/opcodes.def +++ b/src/cuda-sim/opcodes.def @@ -103,6 +103,7 @@ OP_DEF(SELP_OP,selp_impl,"selp",1,1) OP_DEF(SETP_OP,setp_impl,"setp",1,1) OP_DEF(SET_OP,set_impl,"set",1,1) OP_W_DEF(SHFL_OP,shfl_impl,"shfl",1,10) +OP_DEF(SHF_OP,shf_impl,"shf",1,1) OP_DEF(SHL_OP,shl_impl,"shl",1,1) OP_DEF(SHR_OP,shr_impl,"shr",1,1) OP_DEF(SIN_OP,sin_impl,"sin",1,4) diff --git a/src/cuda-sim/ptx.l b/src/cuda-sim/ptx.l index 675404597..0810ef6e2 100644 --- a/src/cuda-sim/ptx.l +++ b/src/cuda-sim/ptx.l @@ -1,32 +1,35 @@ /* -Copyright (c) 2009-2011, Tor M. Aamodt -The University of British Columbia +Copyright (c) 2009-2021, Tor M. Aamodt, Vijay Kandiah, Nikos Hardavellas, +Mahmoud Khairy, Junrui Pan, Timothy G. Rogers +The University of British Columbia, Northwestern University, Purdue University All rights reserved. - Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: -Redistributions of source code must retain the above copyright notice, this -list of conditions and the following disclaimer. -Redistributions in binary form must reproduce the above copyright notice, this -list of conditions and the following disclaimer in the documentation and/or -other materials provided with the distribution. -Neither the name of The University of British Columbia nor the names of its -contributors may be used to endorse or promote products derived from this -software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer; +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution; +3. 
Neither the names of The University of British Columbia, Northwestern + University nor the names of their contributors may be used to + endorse or promote products derived from this software without specific + prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. */ + %option nounput %option noyywrap %option yylineno @@ -69,6 +72,7 @@ andn TC; yylval->int_value = ANDN_OP; return OPCODE; atom TC; yylval->int_value = ATOM_OP; return OPCODE; bar.warp TC; yylval->int_value = NOP_OP; return OPCODE; bar TC; yylval->int_value = BAR_OP; return OPCODE; +barrier TC; yylval->int_value = BAR_OP; return OPCODE; bfe TC; yylval->int_value = BFE_OP; return OPCODE; bfi TC; yylval->int_value = BFI_OP; return OPCODE; bfind TC; yylval->int_value = BFIND_OP; return OPCODE; @@ -130,6 +134,7 @@ selp TC; yylval->int_value = SELP_OP; return OPCODE; setp TC; yylval->int_value = SETP_OP; return OPCODE; set TC; yylval->int_value = SET_OP; return OPCODE; shfl TC; yylval->int_value = SHFL_OP; return OPCODE; +shf TC; yylval->int_value = SHF_OP; return OPCODE; shl TC; yylval->int_value = SHL_OP; return OPCODE; shr TC; yylval->int_value = SHR_OP; return OPCODE; sin TC; yylval->int_value = SIN_OP; return OPCODE; @@ -167,14 +172,22 @@ breakaddr TC; yylval->int_value = BREAKADDR_OP; return OPCODE; "CPTX_END" printf("ENDING CUSTOM PTX.\n"); BEGIN(IN_COMMENT); { -\.a\.sync TC; yylval->int_value = LOAD_A; return WMMA_DIRECTIVE; -\.b\.sync TC; yylval->int_value = LOAD_B; return WMMA_DIRECTIVE; -\.c\.sync TC; yylval->int_value = LOAD_C; return WMMA_DIRECTIVE; -\.d\.sync TC; yylval->int_value = STORE_D; return WMMA_DIRECTIVE; -\.mma\.sync TC;yylval->int_value=MMA; return WMMA_DIRECTIVE; +\.a\.sync\.aligned TC; yylval->int_value = LOAD_A; return WMMA_DIRECTIVE; +\.b\.sync\.aligned TC; yylval->int_value = LOAD_B; return WMMA_DIRECTIVE; +\.c\.sync\.aligned TC; yylval->int_value = LOAD_C; return WMMA_DIRECTIVE; +\.d\.sync\.aligned TC; yylval->int_value = STORE_D; return WMMA_DIRECTIVE; +\.mma\.sync\.aligned TC;yylval->int_value=MMA; return WMMA_DIRECTIVE; \.row TC; yylval->int_value = ROW; return LAYOUT; \.col TC; yylval->int_value = COL; return LAYOUT; +\.m16n16k16\.global TC; yylval->int_value = M16N16K16; return CONFIGURATION; +\.m32n8k16\.global TC; yylval->int_value = M32N8K16; return CONFIGURATION; +\.m8n32k16\.global TC; yylval->int_value = M8N32K16; return CONFIGURATION; + +\.m16n16k16\.shared TC; yylval->int_value = M16N16K16; return CONFIGURATION; +\.m32n8k16\.shared TC; yylval->int_value = M32N8K16; return CONFIGURATION; +\.m8n32k16\.shared TC; yylval->int_value = M8N32K16; return CONFIGURATION; + \.m16n16k16 TC; yylval->int_value = M16N16K16; return CONFIGURATION; \.m32n8k16 TC; yylval->int_value = M32N8K16; return CONFIGURATION; \.m8n32k16 TC; yylval->int_value = M8N32K16; return CONFIGURATION; 
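With the `.aligned` requirement and the state-space-suffixed shapes above, the qualifiers of an instruction such as `wmma.load.a.sync.aligned.row.m16n16k16.global.f16 {...}, [%rd1];` (an illustrative PTX line, not one taken from this patch) lex as WMMA_DIRECTIVE, LAYOUT, and CONFIGURATION in turn; the dedicated `\.m16n16k16\.global` and `\.shared` rules exist so the state-space qualifier is consumed together with the shape rather than left dangling after a bare `\.m16n16k16` match.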
@@ -305,6 +318,9 @@ breakaddr TC; yylval->int_value = BREAKADDR_OP; return OPCODE; \.sat TC; return SAT_OPTION; +\.l TC; return LEFT_OPTION; +\.r TC; return RIGHT_OPTION; + \.eq TC; return EQ_OPTION; \.ne TC; return NE_OPTION; \.lt TC; return LT_OPTION; @@ -342,6 +358,8 @@ breakaddr TC; yylval->int_value = BREAKADDR_OP; return OPCODE; \.arrive TC; return ARRIVE_OPTION; \.red TC; return RED_OPTION; +\.clamp TC; return CLAMP_OPTION; +\.wrap TC; return WRAP_OPTION; \.approx TC; return APPROX_OPTION; \.full TC; return FULL_OPTION; diff --git a/src/cuda-sim/ptx.y b/src/cuda-sim/ptx.y index b38f78352..61183e88c 100644 --- a/src/cuda-sim/ptx.y +++ b/src/cuda-sim/ptx.y @@ -220,6 +220,10 @@ class ptx_recognizer; %token PRMT_RC16_MODE; %token PRMT_ECL_MODE; %token PRMT_ECR_MODE; +%token WRAP_OPTION; +%token CLAMP_OPTION; +%token LEFT_OPTION; +%token RIGHT_OPTION; %type function_decl_header %type function_decl @@ -507,6 +511,10 @@ option: type_spec | DOWN_OPTION { recognizer->add_option(DOWN_OPTION); } | BFLY_OPTION { recognizer->add_option(BFLY_OPTION); } | IDX_OPTION { recognizer->add_option(IDX_OPTION); } + | WRAP_OPTION { recognizer->add_option(WRAP_OPTION); } + | CLAMP_OPTION { recognizer->add_option(CLAMP_OPTION); } + | LEFT_OPTION { recognizer->add_option(LEFT_OPTION); } + | RIGHT_OPTION { recognizer->add_option(RIGHT_OPTION); } ; atomic_operation_spec: ATOMIC_AND { recognizer->add_option(ATOMIC_AND); } diff --git a/src/cuda-sim/ptx_ir.cc b/src/cuda-sim/ptx_ir.cc index e5b5fb773..4e500ccb4 100644 --- a/src/cuda-sim/ptx_ir.cc +++ b/src/cuda-sim/ptx_ir.cc @@ -1,19 +1,22 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Ali Bakhoda, Wilson W.L. Fung, -// George L. Yuan -// The University of British Columbia -// All rights reserved. +// Copyright (c) 2009-2021, Tor M. Aamodt, Ali Bakhoda, Wilson W.L. Fung, +// George L. Yuan, Vijay Kandiah, Nikos Hardavellas, +// Mahmoud Khairy, Junrui Pan, Timothy G. Rogers +// The University of British Columbia, Northwestern University, Purdue +// University All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, +// this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. 
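Together with shf_impl earlier in this patch, the `.l`/`.r` and `.clamp`/`.wrap` tokens above wire up the PTX funnel shift: shf treats b as the upper and a as the lower half of a 64-bit pair, shifts the pair by c, and returns one 32-bit half, where `.wrap` masks the shift amount to c & 31 and `.clamp` caps it at 32. A standalone sketch of those semantics, using 64-bit arithmetic to sidestep the shift-by-32 edge cases; this models the behavior and is not the simulator's code:

#include <cstdint>
#include <cstdio>

// Funnel shift over the 64-bit pair {b:a}; left returns the high word after
// shifting left, right returns the low word after shifting right.
uint32_t shf(uint32_t a, uint32_t b, uint32_t c, bool left, bool clamp) {
  uint32_t n = clamp ? (c < 32 ? c : 32) : (c & 0x1f);
  uint64_t pair = (static_cast<uint64_t>(b) << 32) | a;
  return left ? static_cast<uint32_t>((pair << n) >> 32)
              : static_cast<uint32_t>(pair >> n);
}

int main() {
  // With b == a, shf.r.wrap.b32 by 8 is a 32-bit rotate right by 8.
  uint32_t x = 0x12345678u;
  std::printf("0x%08x\n", shf(x, x, 8, /*left=*/false, /*clamp=*/false));
  return 0;  // prints 0x78123456
}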
// // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -136,6 +139,22 @@ symbol *symbol_table::lookup(const char *identifier) { return NULL; } +symbol *symbol_table::lookup_by_addr(addr_t addr) { + for (auto it = m_symbols.begin(); it != m_symbols.end(); ++it) { + symbol *sym = it->second; + + // check if symbol has the addr to be found + if ((!sym->is_reg()) && (sym->has_valid_address()) && + (sym->get_address() == addr)) { + return sym; + } + } + if (m_parent) { + return m_parent->lookup_by_addr(addr); + } + return NULL; +} + symbol *symbol_table::add_variable(const char *identifier, const type_info *type, unsigned size, const char *filename, unsigned line) { @@ -1147,8 +1166,8 @@ static std::list check_operands( const std::list &operands, gpgpu_context *ctx) { static int g_warn_literal_operands_two_type_inst; if ((opcode == CVT_OP) || (opcode == SET_OP) || (opcode == SLCT_OP) || - (opcode == TEX_OP) || (opcode == MMA_OP) || (opcode == DP4A_OP) || - (opcode == VMIN_OP) || (opcode == VMAX_OP) ) { + (opcode == TEX_OP) || (opcode == MMA_OP) || (opcode == DP4A_OP) || + (opcode == VMIN_OP) || (opcode == VMAX_OP)) { // just make sure these do not have have const operands... if (!g_warn_literal_operands_two_type_inst) { std::list::const_iterator o; @@ -1224,6 +1243,8 @@ ptx_instruction::ptx_instruction( m_rounding_mode = RN_OPTION; m_compare_op = -1; m_saturation_mode = 0; + m_clamp_mode = 0; + m_left_mode = 0; m_geom_spec = 0; m_vector_spec = 0; m_atomic_spec = 0; @@ -1290,6 +1311,18 @@ ptx_instruction::ptx_instruction( case SAT_OPTION: m_saturation_mode = 1; break; + case WRAP_OPTION: + m_clamp_mode = 0; + break; + case CLAMP_OPTION: + m_clamp_mode = 1; + break; + case LEFT_OPTION: + m_left_mode = 1; + break; + case RIGHT_OPTION: + m_left_mode = 0; + break; case RNI_OPTION: case RZI_OPTION: case RMI_OPTION: @@ -1384,6 +1417,8 @@ ptx_instruction::ptx_instruction( case CS_OPTION: case LU_OPTION: case CV_OPTION: + case WB_OPTION: + case WT_OPTION: m_cache_option = last_ptx_inst_option; break; case HALF_OPTION: @@ -1465,8 +1500,8 @@ std::string ptx_instruction::to_string() const { char buf[STR_SIZE]; unsigned used_bytes = 0; if (!is_label()) { - used_bytes += - snprintf(buf + used_bytes, STR_SIZE - used_bytes, " PC=0x%03x ", m_PC); + used_bytes += snprintf(buf + used_bytes, STR_SIZE - used_bytes, + " PC=0x%03llx ", m_PC); } else { used_bytes += snprintf(buf + used_bytes, STR_SIZE - used_bytes, " "); diff --git a/src/cuda-sim/ptx_ir.h b/src/cuda-sim/ptx_ir.h index 42439412c..b08a692d8 100644 --- a/src/cuda-sim/ptx_ir.h +++ b/src/cuda-sim/ptx_ir.h @@ -205,6 +205,7 @@ class symbol { const std::string &name() const { return m_name; } const std::string &decl_location() const { return m_decl_location; } const type_info *type() const { return m_type; } + bool has_valid_address() const { return m_address_valid; } addr_t get_address() const { assert(m_is_label || !m_type->get_key().is_reg()); // todo : other assertions @@ -310,6 +311,7 @@ class symbol_table { void set_ptx_version(float ver, unsigned ext); void set_sm_target(const char *target, const char *ext, const char *ext2); symbol *lookup(const char *identifier); + symbol *lookup_by_addr(addr_t addr); std::string get_scope_name() const { return m_scope_name; } symbol *add_variable(const char *identifier, const type_info *type, unsigned size, const char *filename, unsigned line); @@ -966,8 +968,8 @@ class ptx_instruction : public warp_inst_t { 
int get_pred_mod() const { return m_pred_mod; } const char *get_source() const { return m_source.c_str(); } - const std::list get_scalar_type() const {return m_scalar_type;} - const std::list get_options() const {return m_options;} + const std::list get_scalar_type() const { return m_scalar_type; } + const std::list get_options() const { return m_options; } typedef std::vector::const_iterator const_iterator; @@ -1085,6 +1087,8 @@ class ptx_instruction : public warp_inst_t { unsigned cache_option() const { return m_cache_option; } unsigned rounding_mode() const { return m_rounding_mode; } unsigned saturation_mode() const { return m_saturation_mode; } + unsigned clamp_mode() const { return m_clamp_mode; } + unsigned left_mode() const { return m_left_mode; } unsigned dimension() const { return m_geom_spec; } unsigned barrier_op() const { return m_barrier_op; } unsigned shfl_op() const { return m_shfl_op; } @@ -1159,6 +1163,8 @@ class ptx_instruction : public warp_inst_t { unsigned m_rounding_mode; unsigned m_compare_op; unsigned m_saturation_mode; + unsigned m_clamp_mode; + unsigned m_left_mode; unsigned m_barrier_op; unsigned m_shfl_op; unsigned m_prmt_op; @@ -1248,6 +1254,7 @@ class function_info { const ptx_version &get_ptx_version() const { return m_symtab->get_ptx_version(); } + virtual ~function_info() {} unsigned get_sm_target() const { return m_symtab->get_sm_target(); } bool is_extern() const { return m_extern; } void set_name(const char *name) { m_name = name; } diff --git a/src/cuda-sim/ptx_loader.cc b/src/cuda-sim/ptx_loader.cc index 4e91763e8..df354983e 100644 --- a/src/cuda-sim/ptx_loader.cc +++ b/src/cuda-sim/ptx_loader.cc @@ -95,7 +95,7 @@ void gpgpu_context::print_ptx_file(const char *p, unsigned source_num, const ptx_instruction *pI = ptx_parser->ptx_instruction_lookup(filename, n); char pc[64]; if (pI && pI->get_PC()) - snprintf(pc, 64, "%4u", pI->get_PC()); + snprintf(pc, 64, "%4llu", pI->get_PC()); else snprintf(pc, 64, " "); printf(" _%u.ptx %4u (pc=%s): %s\n", source_num, n, pc, t); @@ -240,7 +240,7 @@ void fix_duplicate_errors(char fname2[1024]) { unsigned oldlinenum = 1; unsigned linenum; char *startptr = ptxdata; - char *funcptr; + char *funcptr = NULL; char *tempptr = ptxdata - 1; char *lineptr = ptxdata - 1; @@ -320,7 +320,7 @@ void fix_duplicate_errors(char fname2[1024]) { // we need the application name here too. char *get_app_binary_name() { char exe_path[1025]; - char *self_exe_path; + char *self_exe_path = NULL; #ifdef __APPLE__ // AMRUTH: get apple device and check the result. 
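The new symbol_table::lookup_by_addr declared above is what the indirect texture paths in get_tex_datasize and tex_impl rely on: when tex's first operand is a register instead of a named texref, the register's value is taken as a symbol address, and the symbol table, including parent scopes, is scanned to recover the texture's name. A compressed sketch of that reverse lookup with simplified types; the real code additionally skips registers and address-less symbols:

#include <cassert>
#include <map>
#include <string>

// Simplified stand-ins for the simulator's symbol machinery.
struct sym { std::string name; unsigned long long addr; bool valid; };

struct scope {
  std::map<std::string, sym> symbols;
  scope *parent = nullptr;

  // Linear scan for a symbol placed at addr, falling back to the enclosing
  // scope, mirroring lookup_by_addr above.
  const sym *lookup_by_addr(unsigned long long addr) const {
    for (const auto &kv : symbols)
      if (kv.second.valid && kv.second.addr == addr) return &kv.second;
    return parent ? parent->lookup_by_addr(addr) : nullptr;
  }
};

int main() {
  scope global;
  global.symbols["texA"] = {"texA", 0x1000, true};
  scope local{{}, &global};
  const sym *t = local.lookup_by_addr(0x1000);  // register held 0x1000
  assert(t && t->name == "texA");
  return 0;
}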
printf("WARNING: not tested for Apple-mac devices \n"); diff --git a/src/cuda-sim/ptx_parser.cc b/src/cuda-sim/ptx_parser.cc index afdb41ba8..a80eeae64 100644 --- a/src/cuda-sim/ptx_parser.cc +++ b/src/cuda-sim/ptx_parser.cc @@ -206,7 +206,7 @@ void ptx_recognizer::end_function() { gpgpu_ptx_assemble(g_func_info->get_name(), g_func_info); g_current_symbol_table = g_global_symbol_table; - PTX_PARSE_DPRINTF("function %s, PC = %d\n", g_func_info->get_name().c_str(), + PTX_PARSE_DPRINTF("function %s, PC = %llu\n", g_func_info->get_name().c_str(), g_func_info->get_start_PC()); } @@ -486,7 +486,7 @@ void ptx_recognizer::add_identifier(const char *identifier, int array_dim, case param_space_local: printf( "GPGPU-Sim PTX: allocating stack frame region for .param \"%s\" from " - "0x%x to 0x%lx\n", + "0x%llx to 0x%llx\n", identifier, g_current_symbol_table->get_local_next(), g_current_symbol_table->get_local_next() + num_bits / 8); fflush(stdout); @@ -521,7 +521,7 @@ void ptx_recognizer::add_constptr(const char *identifier1, unsigned addr = s2->get_address(); - printf("GPGPU-Sim PTX: moving \"%s\" from 0x%x to 0x%x (%s+%x)\n", + printf("GPGPU-Sim PTX: moving \"%s\" from 0x%llx to 0x%x (%s+%d)\n", identifier1, s1->get_address(), addr + offset, identifier2, offset); s1->set_address(addr + offset); @@ -622,13 +622,13 @@ void ptx_recognizer::add_scalar_type_spec(int type_spec) { g_ptx_token_decode[type_spec].c_str()); g_scalar_type.push_back(type_spec); if (g_scalar_type.size() > 1) { - parse_assert((g_opcode == -1) || (g_opcode == CVT_OP) || - (g_opcode == SET_OP) || (g_opcode == SLCT_OP) || - (g_opcode == TEX_OP) || (g_opcode == MMA_OP) || - (g_opcode == DP4A_OP) || (g_opcode == VMIN_OP) || - (g_opcode == VMAX_OP), - "only cvt, set, slct, tex, vmin, vmax and dp4a can have more than one " - "type specifier."); + parse_assert( + (g_opcode == -1) || (g_opcode == CVT_OP) || (g_opcode == SET_OP) || + (g_opcode == SLCT_OP) || (g_opcode == TEX_OP) || + (g_opcode == MMA_OP) || (g_opcode == DP4A_OP) || + (g_opcode == VMIN_OP) || (g_opcode == VMAX_OP), + "only cvt, set, slct, tex, vmin, vmax and dp4a can have more than one " + "type specifier."); } g_scalar_type_spec = type_spec; } diff --git a/src/cuda-sim/ptx_sim.cc b/src/cuda-sim/ptx_sim.cc index dc801f8ca..2a548ee36 100644 --- a/src/cuda-sim/ptx_sim.cc +++ b/src/cuda-sim/ptx_sim.cc @@ -369,7 +369,8 @@ static void print_reg(FILE *fp, std::string name, ptx_reg_t value, fprintf(fp, ".u64 %llu [0x%llx]\n", value.u64, value.u64); break; case F16_TYPE: - fprintf(fp, ".f16 %f [0x%04x]\n", value.f16, (unsigned)value.u16); + fprintf(fp, ".f16 %f [0x%04x]\n", static_cast(value.f16), + (unsigned)value.u16); break; case F32_TYPE: fprintf(fp, ".f32 %.15lf [0x%08x]\n", value.f32, value.u32); diff --git a/src/cuda-sim/ptx_sim.h b/src/cuda-sim/ptx_sim.h index f0c26efc8..8eec922e4 100644 --- a/src/cuda-sim/ptx_sim.h +++ b/src/cuda-sim/ptx_sim.h @@ -459,6 +459,9 @@ class ptx_thread_info { // Jin: get corresponding kernel grid for CDP purpose kernel_info_t &get_kernel() { return m_kernel; } + // Weili: access symbol_table + symbol_table *get_symbol_table() { return m_symbol_table; } + public: addr_t m_last_effective_address; bool m_branch_taken; diff --git a/src/debug.cc b/src/debug.cc index 29506bd75..8cc5e1f52 100644 --- a/src/debug.cc +++ b/src/debug.cc @@ -124,7 +124,7 @@ void gpgpu_sim::gpgpu_debug() { fflush(stdout); char line[1024]; - fgets(line, 1024, stdin); + char *ptr = fgets(line, 1024, stdin); char *tok = strtok(line, " \t\n"); if (!strcmp(tok, "dp")) { @@ -136,7 
+136,11 @@ void gpgpu_sim::gpgpu_debug() { fflush(stdout); } else if (!strcmp(tok, "q") || !strcmp(tok, "quit")) { printf("\nreally quit GPGPU-Sim (y/n)?\n"); - fgets(line, 1024, stdin); + ptr = fgets(line, 1024, stdin); + if (ptr == NULL) { + printf("can't read input\n"); + exit(0); + } tok = strtok(line, " \t\n"); if (!strcmp(tok, "y")) { exit(0); diff --git a/src/gpgpu-sim/CMakeLists.txt b/src/gpgpu-sim/CMakeLists.txt new file mode 100644 index 000000000..04f197307 --- /dev/null +++ b/src/gpgpu-sim/CMakeLists.txt @@ -0,0 +1,36 @@ +# Exclude power_interface.cc if no power model +list(APPEND gpgpusim_SRC addrdec.cc + dram.cc + dram_sched.cc + gpu-cache.cc + gpu-misc.cc + gpu-sim.cc + hashing.cc + histogram.cc + icnt_wrapper.cc + l2cache.cc + local_interconnect.cc + mem_fetch.cc + mem_latency_stat.cc + power_interface.cc + power_stat.cc + scoreboard.cc + shader.cc + stack.cc + stat-tool.cc + traffic_breakdown.cc + visualizer.cc) +if(NOT GPGPUSIM_USE_POWER_MODEL) + list(REMOVE_ITEM ${gpgpusim_SRC} power_interface.cc) +endif() + +# Create libgpgpusim.a +add_library(gpgpusim STATIC ${gpgpusim_SRC}) +target_include_directories(gpgpusim PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) +target_include_directories(gpgpusim PUBLIC ${CUDAToolkit_INCLUDE_DIRS}) + +if(GPGPUSIM_USE_POWER_MODEL) +target_compile_definitions(gpgpusim PRIVATE GPGPUSIM_POWER_MODEL) +target_include_directories(gpgpusim PRIVATE ${GPGPUSIM_POWER_MODEL}) +endif() + diff --git a/src/gpgpu-sim/addrdec.cc b/src/gpgpu-sim/addrdec.cc index 19714ec99..db27c825b 100644 --- a/src/gpgpu-sim/addrdec.cc +++ b/src/gpgpu-sim/addrdec.cc @@ -519,7 +519,7 @@ void linear_to_raw_address_translation::sweep_test() const { h->second, raw_addr); abort(); } else { - assert((int)tlx.chip < m_n_channel); + assert(tlx.chip < m_n_channel); // ensure that partition_address() returns the concatenated address if ((ADDR_CHIP_S != -1 and raw_addr >= (1ULL << ADDR_CHIP_S)) or (ADDR_CHIP_S == -1 and raw_addr >= (1ULL << addrdec_mklow[CHIP]))) { @@ -584,7 +584,7 @@ unsigned next_powerOf2(unsigned n) { n = n - 1; // do till only one bit is left - while (n & n - 1) n = n & (n - 1); // unset rightmost bit + while (n & (n - 1)) n = n & (n - 1); // unset rightmost bit // n is now a power of two (less than n) diff --git a/src/gpgpu-sim/dram.cc b/src/gpgpu-sim/dram.cc index ca47c4684..80e20d795 100644 --- a/src/gpgpu-sim/dram.cc +++ b/src/gpgpu-sim/dram.cc @@ -1,19 +1,22 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Wilson W.L. Fung, Ali Bakhoda, -// Ivan Sham, George L. Yuan, -// The University of British Columbia -// All rights reserved. +// Copyright (c) 2009-2021, Tor M. Aamodt, Wilson W.L. Fung, Ali Bakhoda, +// Ivan Sham, George L. Yuan, Vijay Kandiah, Nikos Hardavellas, +// Mahmoud Khairy, Junrui Pan, Timothy G. Rogers +// The University of British Columbia, Northwestern University, Purdue +// University All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. 
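The parenthesization fix in next_powerOf2 above is behavior-preserving, since `-` already binds tighter than `&`; it mainly silences a compiler warning and makes the lowest-set-bit-clearing idiom explicit. For readers of the fragment, here is a self-contained version of the round-up idiom it comes from, sketched under the assumption that the function doubles the surviving bit at the end:

#include <cassert>

// Round n up to the next power of two: clear the lowest set bit of n-1
// until a single bit remains, then double it.
unsigned next_power_of_2(unsigned n) {
  if (n && !(n & (n - 1))) return n;    // already a power of two
  n = n - 1;
  while (n & (n - 1)) n = n & (n - 1);  // unset rightmost bit
  return n << 1;
}

int main() {
  assert(next_power_of_2(5) == 8);
  assert(next_power_of_2(8) == 8);
  assert(next_power_of_2(1) == 1);
  return 0;
}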
Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, +// this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -590,39 +593,40 @@ bool dram_t::issue_col_command(int j) { bk[j]->mrq = NULL; } } else - // correct row activated for a WRITE - if (!issued && !CCDc && !bk[j]->RCDWRc && !(bkgrp[grp]->CCDLc) && - (bk[j]->curr_row == bk[j]->mrq->row) && (bk[j]->mrq->rw == WRITE) && - (RTWc == 0) && (bk[j]->state == BANK_ACTIVE) && !rwq->full()) { - if (rw == READ) { - rw = WRITE; - rwq->set_min_length(m_config->WL); - } - rwq->push(bk[j]->mrq); - - bk[j]->mrq->txbytes += m_config->dram_atom_size; - CCDc = m_config->tCCD; - bkgrp[grp]->CCDLc = m_config->tCCDL; - WTRc = m_config->tWTR; - bk[j]->WTPc = m_config->tWTP; - issued = true; - - if (bk[j]->mrq->data->get_access_type() == L2_WRBK_ACC) - n_wr_WB++; - else - n_wr++; - bwutil += m_config->BL / m_config->data_command_freq_ratio; - bwutil_partial += m_config->BL / m_config->data_command_freq_ratio; + // correct row activated for a WRITE + if (!issued && !CCDc && !bk[j]->RCDWRc && !(bkgrp[grp]->CCDLc) && + (bk[j]->curr_row == bk[j]->mrq->row) && (bk[j]->mrq->rw == WRITE) && + (RTWc == 0) && (bk[j]->state == BANK_ACTIVE) && !rwq->full()) { + if (rw == READ) { + rw = WRITE; + rwq->set_min_length(m_config->WL); + } + rwq->push(bk[j]->mrq); + + bk[j]->mrq->txbytes += m_config->dram_atom_size; + CCDc = m_config->tCCD; + bkgrp[grp]->CCDLc = m_config->tCCDL; + WTRc = m_config->tWTR; + bk[j]->WTPc = m_config->tWTP; + issued = true; + + if (bk[j]->mrq->data->get_access_type() == L2_WRBK_ACC) + n_wr_WB++; + else + n_wr++; + bwutil += m_config->BL / m_config->data_command_freq_ratio; + bwutil_partial += m_config->BL / m_config->data_command_freq_ratio; #ifdef DRAM_VERIFY - PRINT_CYCLE = 1; - printf("\tWR Bk:%d Row:%03x Col:%03x \n", j, bk[j]->curr_row, - bk[j]->mrq->col + bk[j]->mrq->txbytes - m_config->dram_atom_size); + PRINT_CYCLE = 1; + printf( + "\tWR Bk:%d Row:%03x Col:%03x \n", j, bk[j]->curr_row, + bk[j]->mrq->col + bk[j]->mrq->txbytes - m_config->dram_atom_size); #endif - // transfer done - if (!(bk[j]->mrq->txbytes < bk[j]->mrq->nbytes)) { - bk[j]->mrq = NULL; + // transfer done + if (!(bk[j]->mrq->txbytes < bk[j]->mrq->nbytes)) { + bk[j]->mrq = NULL; + } } - } } return issued; @@ -658,23 +662,23 @@ bool dram_t::issue_row_command(int j) { } else - // different row activated - if ((!issued) && (bk[j]->curr_row != bk[j]->mrq->row) && - (bk[j]->state == BANK_ACTIVE) && - (!bk[j]->RASc && !bk[j]->WTPc && !bk[j]->RTPc && - !bkgrp[grp]->RTPLc)) { - // make the bank idle again - bk[j]->state = BANK_IDLE; - bk[j]->RPc = m_config->tRP; - prio = (j + 1) % m_config->nbk; - issued = true; - n_pre++; - 
n_pre_partial++; + // different row activated + if ((!issued) && (bk[j]->curr_row != bk[j]->mrq->row) && + (bk[j]->state == BANK_ACTIVE) && + (!bk[j]->RASc && !bk[j]->WTPc && !bk[j]->RTPc && + !bkgrp[grp]->RTPLc)) { + // make the bank idle again + bk[j]->state = BANK_IDLE; + bk[j]->RPc = m_config->tRP; + prio = (j + 1) % m_config->nbk; + issued = true; + n_pre++; + n_pre_partial++; #ifdef DRAM_VERIFY - PRINT_CYCLE = 1; - printf("\tPRE BK:%d Row:%03x \n", j, bk[j]->curr_row); + PRINT_CYCLE = 1; + printf("\tPRE BK:%d Row:%03x \n", j, bk[j]->curr_row); #endif - } + } } return issued; } @@ -855,7 +859,7 @@ void dram_t::visualizer_print(gzFile visualizer_file) { void dram_t::set_dram_power_stats(unsigned &cmd, unsigned &activity, unsigned &nop, unsigned &act, unsigned &pre, - unsigned &rd, unsigned &wr, + unsigned &rd, unsigned &wr, unsigned &wr_WB, unsigned &req) const { // Point power performance counters to low-level DRAM counters cmd = n_cmd; @@ -865,6 +869,7 @@ void dram_t::set_dram_power_stats(unsigned &cmd, unsigned &activity, pre = n_pre; rd = n_rd; wr = n_wr; + wr_WB = n_wr_WB; req = n_req; } @@ -877,4 +882,5 @@ unsigned dram_t::get_bankgrp_number(unsigned i) { } else { assert(1); } + return 0; // we should never get here } diff --git a/src/gpgpu-sim/dram.h b/src/gpgpu-sim/dram.h index 6c212e9be..9e9517b9d 100644 --- a/src/gpgpu-sim/dram.h +++ b/src/gpgpu-sim/dram.h @@ -1,19 +1,22 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Ivan Sham, Ali Bakhoda, -// George L. Yuan, Wilson W.L. Fung -// The University of British Columbia -// All rights reserved. +// Copyright (c) 2009-2021, Tor M. Aamodt, Ivan Sham, Ali Bakhoda, +// George L. Yuan, Wilson W.L. Fung, Vijay Kandiah, Nikos Hardavellas, +// Mahmoud Khairy, Junrui Pan, Timothy G. Rogers +// The University of British Columbia, Northwestern University, Purdue +// University All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, +// this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. 
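The WRITE and precharge blocks in issue_col_command and issue_row_command above are re-indented, not re-logicked, but the dense guard on the write path is easier to read spelled out. An annotated restatement with illustrative names, not simulator code:

#include <cstdio>

// A WRITE column command may issue only when every relevant timing counter
// has drained, the open row matches the request, and there is queue space.
struct bank_state {
  unsigned RCDWRc;    // RAS-to-CAS (write) delay remaining
  unsigned curr_row;  // currently open row
  bool active;        // bank is in BANK_ACTIVE
};

bool can_issue_write(const bank_state &bk, unsigned req_row, unsigned CCDc,
                     unsigned CCDLc, unsigned RTWc, bool rwq_full) {
  return CCDc == 0       // column-to-column spacing satisfied
      && bk.RCDWRc == 0  // row opened long enough ago for a write
      && CCDLc == 0      // same-bank-group column spacing satisfied
      && RTWc == 0       // read-to-write turnaround finished
      && bk.active && bk.curr_row == req_row  // correct row is open
      && !rwq_full;      // room in the data-transfer queue
}

int main() {
  bank_state bk{0, 42, true};
  std::printf("%d\n", can_issue_write(bk, 42, 0, 0, 0, false));  // prints 1
  return 0;
}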
// // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -135,7 +138,7 @@ class dram_t { // Power Model void set_dram_power_stats(unsigned &cmd, unsigned &activity, unsigned &nop, unsigned &act, unsigned &pre, unsigned &rd, - unsigned &wr, unsigned &req) const; + unsigned &wr, unsigned &wr_WB, unsigned &req) const; const memory_config *m_config; diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc index 75c369136..0ea9ff63d 100644 --- a/src/gpgpu-sim/gpu-cache.cc +++ b/src/gpgpu-sim/gpu-cache.cc @@ -1,18 +1,22 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Tayler Hetherington -// The University of British Columbia -// All rights reserved. +// Copyright (c) 2009-2021, Tor M. Aamodt, Tayler Hetherington, +// Vijay Kandiah, Nikos Hardavellas, Mahmoud Khairy, Junrui Pan, +// Timothy G. Rogers +// The University of British Columbia, Northwestern University, Purdue +// University All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, +// this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. 
// // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -37,7 +41,8 @@ const char *cache_request_status_str(enum cache_request_status status) { static const char *static_cache_request_status_str[] = { - "HIT", "HIT_RESERVED", "MISS", "RESERVATION_FAIL", "SECTOR_MISS"}; + "HIT", "HIT_RESERVED", "MISS", "RESERVATION_FAIL", + "SECTOR_MISS", "MSHR_HIT"}; assert(sizeof(static_cache_request_status_str) / sizeof(const char *) == NUM_CACHE_REQUEST_STATUS); @@ -63,9 +68,9 @@ unsigned l1d_cache_config::set_bank(new_addr_type addr) const { // For sector cache, we select one sector per bank (sector interleaving) // This is what was found in Volta (one sector per bank, sector interleaving) // otherwise, line interleaving - return cache_config::hash_function(addr, l1_banks, l1_banks_byte_interleaving, - m_l1_banks_log2, - l1_banks_hashing_function); + return cache_config::hash_function(addr, l1_banks, + l1_banks_byte_interleaving_log2, + l1_banks_log2, l1_banks_hashing_function); } unsigned cache_config::set_index(new_addr_type addr) const { @@ -210,6 +215,7 @@ void tag_array::init(int core_id, int type_id) { m_core_id = core_id; m_type_id = type_id; is_used = false; + m_dirty = 0; } void tag_array::add_pending_line(mem_fetch *mf) { @@ -231,15 +237,15 @@ void tag_array::remove_pending_line(mem_fetch *mf) { } enum cache_request_status tag_array::probe(new_addr_type addr, unsigned &idx, - mem_fetch *mf, + mem_fetch *mf, bool is_write, bool probe_mode) const { mem_access_sector_mask_t mask = mf->get_access_sector_mask(); - return probe(addr, idx, mask, probe_mode, mf); + return probe(addr, idx, mask, is_write, probe_mode, mf); } enum cache_request_status tag_array::probe(new_addr_type addr, unsigned &idx, mem_access_sector_mask_t mask, - bool probe_mode, + bool is_write, bool probe_mode, mem_fetch *mf) const { // assert( m_config.m_write_policy == READ_ONLY ); unsigned set_index = m_config.set_index(addr); @@ -250,7 +256,6 @@ enum cache_request_status tag_array::probe(new_addr_type addr, unsigned &idx, unsigned long long valid_timestamp = (unsigned)-1; bool all_reserved = true; - // check for hit or pending hit for (unsigned way = 0; way < m_config.m_assoc; way++) { unsigned index = set_index * m_config.m_assoc + way; @@ -263,7 +268,7 @@ enum cache_request_status tag_array::probe(new_addr_type addr, unsigned &idx, idx = index; return HIT; } else if (line->get_status(mask) == MODIFIED) { - if (line->is_readable(mask)) { + if ((!is_write && line->is_readable(mask)) || is_write) { idx = index; return HIT; } else { @@ -279,20 +284,32 @@ enum cache_request_status tag_array::probe(new_addr_type addr, unsigned &idx, } } if (!line->is_reserved_line()) { - all_reserved = false; - if (line->is_invalid_line()) { - invalid_line = index; - } else { - // valid line : keep track of most appropriate replacement candidate - if (m_config.m_replacement_policy == LRU) { - if (line->get_last_access_time() < valid_timestamp) { - valid_timestamp = line->get_last_access_time(); - valid_line = index; - } - } else if (m_config.m_replacement_policy == FIFO) { - if (line->get_alloc_time() < valid_timestamp) { - valid_timestamp = line->get_alloc_time(); - valid_line = index; + // percentage of dirty lines in the cache + // number of dirty lines / total lines in the cache + float dirty_line_percentage = + ((float)m_dirty / (m_config.m_nset * m_config.m_assoc)) * 100; + // If the cacheline is from a load op (not modified), + // or 
the total dirty cacheline is above a specific value, + // Then this cacheline is eligible to be considered for replacement + // candidate i.e. Only evict clean cachelines until total dirty cachelines + // reach the limit. + if (!line->is_modified_line() || + dirty_line_percentage >= m_config.m_wr_percent) { + all_reserved = false; + if (line->is_invalid_line()) { + invalid_line = index; + } else { + // valid line : keep track of most appropriate replacement candidate + if (m_config.m_replacement_policy == LRU) { + if (line->get_last_access_time() < valid_timestamp) { + valid_timestamp = line->get_last_access_time(); + valid_line = index; + } + } else if (m_config.m_replacement_policy == FIFO) { + if (line->get_alloc_time() < valid_timestamp) { + valid_timestamp = line->get_alloc_time(); + valid_line = index; + } } } } @@ -312,15 +329,6 @@ enum cache_request_status tag_array::probe(new_addr_type addr, unsigned &idx, abort(); // if an unreserved block exists, it is either invalid or // replaceable - if (probe_mode && m_config.is_streaming()) { - line_table::const_iterator i = - pending_lines.find(m_config.block_addr(addr)); - assert(mf); - if (!mf->is_write() && i != pending_lines.end()) { - if (i->second != mf->get_inst().get_uid()) return SECTOR_MISS; - } - } - return MISS; } @@ -340,7 +348,7 @@ enum cache_request_status tag_array::access(new_addr_type addr, unsigned time, m_access++; is_used = true; shader_cache_access_log(m_core_id, m_type_id, 0); // log accesses to cache - enum cache_request_status status = probe(addr, idx, mf); + enum cache_request_status status = probe(addr, idx, mf, mf->is_write()); switch (status) { case HIT_RESERVED: m_pending_hit++; @@ -353,8 +361,12 @@ enum cache_request_status tag_array::access(new_addr_type addr, unsigned time, if (m_config.m_alloc_policy == ON_MISS) { if (m_lines[idx]->is_modified_line()) { wb = true; + // m_lines[idx]->set_byte_mask(mf); evicted.set_info(m_lines[idx]->m_block_addr, - m_lines[idx]->get_modified_size()); + m_lines[idx]->get_modified_size(), + m_lines[idx]->get_dirty_byte_mask(), + m_lines[idx]->get_dirty_sector_mask()); + m_dirty--; } m_lines[idx]->allocate(m_config.tag(addr), m_config.block_addr(addr), time, mf->get_access_sector_mask()); @@ -365,8 +377,12 @@ enum cache_request_status tag_array::access(new_addr_type addr, unsigned time, m_sector_miss++; shader_cache_access_log(m_core_id, m_type_id, 1); // log cache misses if (m_config.m_alloc_policy == ON_MISS) { + bool before = m_lines[idx]->is_modified_line(); ((sector_cache_block *)m_lines[idx]) ->allocate_sector(time, mf->get_access_sector_mask()); + if (before && !m_lines[idx]->is_modified_line()) { + m_dirty--; + } } break; case RESERVATION_FAIL: @@ -383,31 +399,51 @@ enum cache_request_status tag_array::access(new_addr_type addr, unsigned time, return status; } -void tag_array::fill(new_addr_type addr, unsigned time, mem_fetch *mf) { - fill(addr, time, mf->get_access_sector_mask()); +void tag_array::fill(new_addr_type addr, unsigned time, mem_fetch *mf, + bool is_write) { + fill(addr, time, mf->get_access_sector_mask(), mf->get_access_byte_mask(), + is_write); } void tag_array::fill(new_addr_type addr, unsigned time, - mem_access_sector_mask_t mask) { + mem_access_sector_mask_t mask, + mem_access_byte_mask_t byte_mask, bool is_write) { // assert( m_config.m_alloc_policy == ON_FILL ); unsigned idx; - enum cache_request_status status = probe(addr, idx, mask); + enum cache_request_status status = probe(addr, idx, mask, is_write); + + if (status == RESERVATION_FAIL) { + 
return; + } + + bool before = m_lines[idx]->is_modified_line(); // assert(status==MISS||status==SECTOR_MISS); // MSHR should have prevented // redundant memory request - if (status == MISS) + if (status == MISS) { m_lines[idx]->allocate(m_config.tag(addr), m_config.block_addr(addr), time, mask); - else if (status == SECTOR_MISS) { + } else if (status == SECTOR_MISS) { assert(m_config.m_cache_type == SECTOR); ((sector_cache_block *)m_lines[idx])->allocate_sector(time, mask); } - - m_lines[idx]->fill(time, mask); + if (before && !m_lines[idx]->is_modified_line()) { + m_dirty--; + } + before = m_lines[idx]->is_modified_line(); + m_lines[idx]->fill(time, mask, byte_mask); + if (m_lines[idx]->is_modified_line() && !before) { + m_dirty++; + } } void tag_array::fill(unsigned index, unsigned time, mem_fetch *mf) { assert(m_config.m_alloc_policy == ON_MISS); - m_lines[index]->fill(time, mf->get_access_sector_mask()); + bool before = m_lines[index]->is_modified_line(); + m_lines[index]->fill(time, mf->get_access_sector_mask(), + mf->get_access_byte_mask()); + if (m_lines[index]->is_modified_line() && !before) { + m_dirty++; + } } // TODO: we need to write back the flushed data to the upper level @@ -416,10 +452,12 @@ void tag_array::flush() { for (unsigned i = 0; i < m_config.get_num_lines(); i++) if (m_lines[i]->is_modified_line()) { - for (unsigned j = 0; j < SECTOR_CHUNCK_SIZE; j++) + for (unsigned j = 0; j < SECTOR_CHUNCK_SIZE; j++) { m_lines[i]->set_status(INVALID, mem_access_sector_mask_t().set(j)); + } } + m_dirty = 0; is_used = false; } @@ -430,6 +468,7 @@ void tag_array::invalidate() { for (unsigned j = 0; j < SECTOR_CHUNCK_SIZE; j++) m_lines[i]->set_status(INVALID, mem_access_sector_mask_t().set(j)); + m_dirty = 0; is_used = false; } @@ -485,8 +524,10 @@ bool was_writeback_sent(const std::list<cache_event> &events, cache_event &wb_event) { for (std::list<cache_event>::const_iterator e = events.begin(); e != events.end(); e++) { - if ((*e).m_cache_event_type == WRITE_BACK_REQUEST_SENT) wb_event = *e; - return true; + if ((*e).m_cache_event_type == WRITE_BACK_REQUEST_SENT) { + wb_event = *e; + return true; + } } return false; } @@ -593,14 +634,6 @@ void mshr_table::display(FILE *fp) const { /***************************************************************** Caches * *****************************************************************/ cache_stats::cache_stats() { - m_stats.resize(NUM_MEM_ACCESS_TYPE); - m_stats_pw.resize(NUM_MEM_ACCESS_TYPE); - m_fail_stats.resize(NUM_MEM_ACCESS_TYPE); - for (unsigned i = 0; i < NUM_MEM_ACCESS_TYPE; ++i) { - m_stats[i].resize(NUM_CACHE_REQUEST_STATUS, 0); - m_stats_pw[i].resize(NUM_CACHE_REQUEST_STATUS, 0); - m_fail_stats[i].resize(NUM_CACHE_RESERVATION_FAIL_STATUS, 0); - } m_cache_port_available_cycles = 0; m_cache_data_port_busy_cycles = 0; m_cache_fill_port_busy_cycles = 0; @@ -610,10 +643,10 @@ void cache_stats::clear() { /// /// Zero out all current cache statistics /// - for (unsigned i = 0; i < NUM_MEM_ACCESS_TYPE; ++i) { - std::fill(m_stats[i].begin(), m_stats[i].end(), 0); - std::fill(m_fail_stats[i].begin(), m_fail_stats[i].end(), 0); - } + m_stats.clear(); + m_stats_pw.clear(); + m_fail_stats.clear(); + m_cache_port_available_cycles = 0; m_cache_data_port_busy_cycles = 0; m_cache_fill_port_busy_cycles = 0; @@ -623,35 +656,67 @@ void cache_stats::clear_pw() { /// /// Zero out per-window cache statistics /// - for (unsigned i = 0; i < NUM_MEM_ACCESS_TYPE; ++i) { - std::fill(m_stats_pw[i].begin(), m_stats_pw[i].end(), 0); - } + m_stats_pw.clear(); } -void cache_stats::inc_stats(int
access_type, int access_outcome) { +void cache_stats::inc_stats(int access_type, int access_outcome, + unsigned long long streamID) { /// /// Increment the stat corresponding to (access_type, access_outcome) by 1. /// if (!check_valid(access_type, access_outcome)) assert(0 && "Unknown cache access type or access outcome"); - m_stats[access_type][access_outcome]++; + if (m_stats.find(streamID) == m_stats.end()) { + std::vector<std::vector<unsigned long long>> new_val; + new_val.resize(NUM_MEM_ACCESS_TYPE); + for (unsigned j = 0; j < NUM_MEM_ACCESS_TYPE; ++j) { + new_val[j].resize(NUM_CACHE_REQUEST_STATUS, 0); + } + m_stats.insert(std::pair<unsigned long long, + std::vector<std::vector<unsigned long long>>>( + streamID, new_val)); + } + m_stats.at(streamID)[access_type][access_outcome]++; } -void cache_stats::inc_stats_pw(int access_type, int access_outcome) { +void cache_stats::inc_stats_pw(int access_type, int access_outcome, + unsigned long long streamID) { /// /// Increment the corresponding per-window cache stat /// if (!check_valid(access_type, access_outcome)) assert(0 && "Unknown cache access type or access outcome"); - m_stats_pw[access_type][access_outcome]++; + + if (m_stats_pw.find(streamID) == m_stats_pw.end()) { + std::vector<std::vector<unsigned long long>> new_val; + new_val.resize(NUM_MEM_ACCESS_TYPE); + for (unsigned j = 0; j < NUM_MEM_ACCESS_TYPE; ++j) { + new_val[j].resize(NUM_CACHE_REQUEST_STATUS, 0); + } + m_stats_pw.insert(std::pair<unsigned long long, + std::vector<std::vector<unsigned long long>>>( + streamID, new_val)); + } + m_stats_pw.at(streamID)[access_type][access_outcome]++; } -void cache_stats::inc_fail_stats(int access_type, int fail_outcome) { +void cache_stats::inc_fail_stats(int access_type, int fail_outcome, + unsigned long long streamID) { if (!check_fail_valid(access_type, fail_outcome)) assert(0 && "Unknown cache access type or access fail"); - m_fail_stats[access_type][fail_outcome]++; + if (m_fail_stats.find(streamID) == m_fail_stats.end()) { + std::vector<std::vector<unsigned long long>> new_val; + new_val.resize(NUM_MEM_ACCESS_TYPE); + for (unsigned j = 0; j < NUM_MEM_ACCESS_TYPE; ++j) { + new_val[j].resize(NUM_CACHE_RESERVATION_FAIL_STATUS, 0); + } + m_fail_stats.insert(std::pair<unsigned long long, + std::vector<std::vector<unsigned long long>>>( + streamID, new_val)); + } + m_fail_stats.at(streamID)[access_type][fail_outcome]++; } enum cache_request_status cache_stats::select_stats_status( @@ -670,7 +735,8 @@ enum cache_request_status cache_stats::select_stats_status( } unsigned long long &cache_stats::operator()(int access_type, int access_outcome, - bool fail_outcome) { + bool fail_outcome, + unsigned long long streamID) { /// /// Simple method to read/modify the stat corresponding to (access_type, /// access_outcome) Used overloaded () to avoid the need for separate @@ -680,17 +746,18 @@ unsigned long long &cache_stats::operator()(int access_type, int access_outcome, if (!check_fail_valid(access_type, access_outcome)) assert(0 && "Unknown cache access type or fail outcome"); - return m_fail_stats[access_type][access_outcome]; + return m_fail_stats.at(streamID)[access_type][access_outcome]; } else { if (!check_valid(access_type, access_outcome)) assert(0 && "Unknown cache access type or access outcome"); - return m_stats[access_type][access_outcome]; + return m_stats.at(streamID)[access_type][access_outcome]; } } unsigned long long cache_stats::operator()(int access_type, int access_outcome, - bool fail_outcome) const { + bool fail_outcome, + unsigned long long streamID) const { /// /// Const accessor into m_stats.
/// @@ -698,12 +765,12 @@ unsigned long long cache_stats::operator()(int access_type, int access_outcome, if (!check_fail_valid(access_type, access_outcome)) assert(0 && "Unknown cache access type or fail outcome"); - return m_fail_stats[access_type][access_outcome]; + return m_fail_stats.at(streamID)[access_type][access_outcome]; } else { if (!check_valid(access_type, access_outcome)) assert(0 && "Unknown cache access type or access outcome"); - return m_stats[access_type][access_outcome]; + return m_stats.at(streamID)[access_type][access_outcome]; } } @@ -712,15 +779,74 @@ cache_stats cache_stats::operator+(const cache_stats &cs) { /// Overloaded + operator to allow for simple stat accumulation /// cache_stats ret; - for (unsigned type = 0; type < NUM_MEM_ACCESS_TYPE; ++type) { - for (unsigned status = 0; status < NUM_CACHE_REQUEST_STATUS; ++status) { - ret(type, status, false) = - m_stats[type][status] + cs(type, status, false); + for (auto iter = m_stats.begin(); iter != m_stats.end(); ++iter) { + unsigned long long streamID = iter->first; + ret.m_stats.insert(std::pair<unsigned long long, + std::vector<std::vector<unsigned long long>>>( + streamID, m_stats.at(streamID))); + } + for (auto iter = m_stats_pw.begin(); iter != m_stats_pw.end(); ++iter) { + unsigned long long streamID = iter->first; + ret.m_stats_pw.insert( + std::pair<unsigned long long, + std::vector<std::vector<unsigned long long>>>( + streamID, m_stats_pw.at(streamID))); + } + for (auto iter = m_fail_stats.begin(); iter != m_fail_stats.end(); ++iter) { + unsigned long long streamID = iter->first; + ret.m_fail_stats.insert( + std::pair<unsigned long long, + std::vector<std::vector<unsigned long long>>>( + streamID, m_fail_stats.at(streamID))); + } + for (auto iter = cs.m_stats.begin(); iter != cs.m_stats.end(); ++iter) { + unsigned long long streamID = iter->first; + if (ret.m_stats.find(streamID) == ret.m_stats.end()) { + ret.m_stats.insert( + std::pair<unsigned long long, + std::vector<std::vector<unsigned long long>>>( + streamID, cs.m_stats.at(streamID))); + } else { + for (unsigned type = 0; type < NUM_MEM_ACCESS_TYPE; ++type) { + for (unsigned status = 0; status < NUM_CACHE_REQUEST_STATUS; ++status) { + ret.m_stats.at(streamID)[type][status] += + cs(type, status, false, streamID); + } + } + } + } + for (auto iter = cs.m_stats_pw.begin(); iter != cs.m_stats_pw.end(); ++iter) { + unsigned long long streamID = iter->first; + if (ret.m_stats_pw.find(streamID) == ret.m_stats_pw.end()) { + ret.m_stats_pw.insert( + std::pair<unsigned long long, + std::vector<std::vector<unsigned long long>>>( + streamID, cs.m_stats_pw.at(streamID))); + } else { + for (unsigned type = 0; type < NUM_MEM_ACCESS_TYPE; ++type) { + for (unsigned status = 0; status < NUM_CACHE_REQUEST_STATUS; ++status) { + ret.m_stats_pw.at(streamID)[type][status] += + cs(type, status, false, streamID); + } + } } - for (unsigned status = 0; status < NUM_CACHE_RESERVATION_FAIL_STATUS; - ++status) { - ret(type, status, true) = - m_fail_stats[type][status] + cs(type, status, true); + } + for (auto iter = cs.m_fail_stats.begin(); iter != cs.m_fail_stats.end(); + ++iter) { + unsigned long long streamID = iter->first; + if (ret.m_fail_stats.find(streamID) == ret.m_fail_stats.end()) { + ret.m_fail_stats.insert( + std::pair<unsigned long long, + std::vector<std::vector<unsigned long long>>>( + streamID, cs.m_fail_stats.at(streamID))); + } else { + for (unsigned type = 0; type < NUM_MEM_ACCESS_TYPE; ++type) { + for (unsigned status = 0; status < NUM_CACHE_RESERVATION_FAIL_STATUS; + ++status) { + ret.m_fail_stats.at(streamID)[type][status] += + cs(type, status, true, streamID); + } + } } } ret.m_cache_port_available_cycles = @@ -736,16 +862,52 @@ cache_stats &cache_stats::operator+=(const cache_stats &cs) { /// /// Overloaded += operator to allow for simple stat accumulation /// - for (unsigned type = 0; type < NUM_MEM_ACCESS_TYPE; ++type) { - for (unsigned
status = 0; status < NUM_CACHE_REQUEST_STATUS; ++status) { - m_stats[type][status] += cs(type, status, false); + for (auto iter = cs.m_stats.begin(); iter != cs.m_stats.end(); ++iter) { + unsigned long long streamID = iter->first; + if (m_stats.find(streamID) == m_stats.end()) { + m_stats.insert(std::pair<unsigned long long, + std::vector<std::vector<unsigned long long>>>( + streamID, cs.m_stats.at(streamID))); + } else { + for (unsigned type = 0; type < NUM_MEM_ACCESS_TYPE; ++type) { + for (unsigned status = 0; status < NUM_CACHE_REQUEST_STATUS; ++status) { + m_stats.at(streamID)[type][status] += + cs(type, status, false, streamID); + } + } } - for (unsigned status = 0; status < NUM_CACHE_REQUEST_STATUS; ++status) { - m_stats_pw[type][status] += cs(type, status, false); + } + for (auto iter = cs.m_stats_pw.begin(); iter != cs.m_stats_pw.end(); ++iter) { + unsigned long long streamID = iter->first; + if (m_stats_pw.find(streamID) == m_stats_pw.end()) { + m_stats_pw.insert(std::pair<unsigned long long, + std::vector<std::vector<unsigned long long>>>( + streamID, cs.m_stats_pw.at(streamID))); + } else { + for (unsigned type = 0; type < NUM_MEM_ACCESS_TYPE; ++type) { + for (unsigned status = 0; status < NUM_CACHE_REQUEST_STATUS; ++status) { + m_stats_pw.at(streamID)[type][status] += + cs(type, status, false, streamID); + } + } } - for (unsigned status = 0; status < NUM_CACHE_RESERVATION_FAIL_STATUS; - ++status) { - m_fail_stats[type][status] += cs(type, status, true); + } + for (auto iter = cs.m_fail_stats.begin(); iter != cs.m_fail_stats.end(); + ++iter) { + unsigned long long streamID = iter->first; + if (m_fail_stats.find(streamID) == m_fail_stats.end()) { + m_fail_stats.insert( + std::pair<unsigned long long, + std::vector<std::vector<unsigned long long>>>( + streamID, cs.m_fail_stats.at(streamID))); + } else { + for (unsigned type = 0; type < NUM_MEM_ACCESS_TYPE; ++type) { + for (unsigned status = 0; status < NUM_CACHE_RESERVATION_FAIL_STATUS; + ++status) { + m_fail_stats.at(streamID)[type][status] += + cs(type, status, true, streamID); + } + } } } m_cache_port_available_cycles += cs.m_cache_port_available_cycles; @@ -754,44 +916,65 @@ cache_stats &cache_stats::operator+=(const cache_stats &cs) { return *this; } -void cache_stats::print_stats(FILE *fout, const char *cache_name) const { +void cache_stats::print_stats(FILE *fout, unsigned long long streamID, + const char *cache_name) const { /// - /// Print out each non-zero cache statistic for every memory access type and - /// status "cache_name" defaults to "Cache_stats" when no argument is - /// provided, otherwise the provided name is used. The printed format is + /// For a given CUDA stream, print out each non-zero cache statistic for every + /// memory access type and status. "cache_name" defaults to "Cache_stats" when + /// no argument is provided, otherwise the provided name is used. The printed + /// format is /// "<cache_name>[<type>][<status>] = <count>" - /// + /// Specify streamID to be -1 to print every stream.
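
The per-stream bookkeeping above replaces the old flat [type][status] tables with one table per CUDA stream. A minimal sketch of that layout under hypothetical names (only the lazy-insert-then-at() pattern and the streamID == -1 "all streams" sentinel mirror the patch):

#include <cstdio>
#include <map>
#include <vector>

typedef std::vector<std::vector<unsigned long long>> stat_table;  // [type][status]

// Lazily create a stream's zeroed table on first use, as inc_stats() does.
static stat_table &table_for(std::map<unsigned long long, stat_table> &stats,
                             unsigned long long streamID, unsigned ntypes,
                             unsigned nstatus) {
  if (stats.find(streamID) == stats.end())
    stats[streamID] = stat_table(ntypes, std::vector<unsigned long long>(nstatus, 0));
  return stats.at(streamID);
}

// streamID is unsigned, so passing -1 wraps to the maximum value and acts as
// the "print every stream" sentinel used by print_stats()/print_fail_stats().
static void print_streams(FILE *fout,
                          const std::map<unsigned long long, stat_table> &stats,
                          unsigned long long streamID) {
  for (std::map<unsigned long long, stat_table>::const_iterator it = stats.begin();
       it != stats.end(); ++it) {
    if ((streamID != (unsigned long long)-1) && (it->first != streamID)) continue;
    fprintf(fout, "stream %llu: %zu access types tracked\n", it->first,
            it->second.size());
  }
}
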
+ std::vector<unsigned> total_access; - total_access.resize(NUM_MEM_ACCESS_TYPE, 0); std::string m_cache_name = cache_name; - for (unsigned type = 0; type < NUM_MEM_ACCESS_TYPE; ++type) { - for (unsigned status = 0; status < NUM_CACHE_REQUEST_STATUS; ++status) { - fprintf(fout, "\t%s[%s][%s] = %llu\n", m_cache_name.c_str(), - mem_access_type_str((enum mem_access_type)type), - cache_request_status_str((enum cache_request_status)status), - m_stats[type][status]); + for (auto iter = m_stats.begin(); iter != m_stats.end(); ++iter) { + unsigned long long streamid = iter->first; + // when streamID is specified, skip stats for all other streams; otherwise, + // print stats from all streams + if ((streamID != -1) && (streamid != streamID)) continue; + total_access.clear(); + total_access.resize(NUM_MEM_ACCESS_TYPE, 0); + for (unsigned type = 0; type < NUM_MEM_ACCESS_TYPE; ++type) { + for (unsigned status = 0; status < NUM_CACHE_REQUEST_STATUS; ++status) { + fprintf(fout, "\t%s[%s][%s] = %llu\n", m_cache_name.c_str(), + mem_access_type_str((enum mem_access_type)type), + cache_request_status_str((enum cache_request_status)status), + m_stats.at(streamid)[type][status]); - if (status != RESERVATION_FAIL) - total_access[type] += m_stats[type][status]; + if (status != RESERVATION_FAIL && status != MSHR_HIT) + // MSHR_HIT is a special case of SECTOR_MISS, + // so it is already included in the SECTOR_MISS count + total_access[type] += m_stats.at(streamid)[type][status]; + } + } + for (unsigned type = 0; type < NUM_MEM_ACCESS_TYPE; ++type) { + if (total_access[type] > 0) + fprintf(fout, "\t%s[%s][%s] = %u\n", m_cache_name.c_str(), + mem_access_type_str((enum mem_access_type)type), "TOTAL_ACCESS", + total_access[type]); } - } - for (unsigned type = 0; type < NUM_MEM_ACCESS_TYPE; ++type) { - if (total_access[type] > 0) - fprintf(fout, "\t%s[%s][%s] = %u\n", m_cache_name.c_str(), - mem_access_type_str((enum mem_access_type)type), "TOTAL_ACCESS", - total_access[type]); } } -void cache_stats::print_fail_stats(FILE *fout, const char *cache_name) const { +void cache_stats::print_fail_stats(FILE *fout, unsigned long long streamID, + const char *cache_name) const { std::string m_cache_name = cache_name; - for (unsigned type = 0; type < NUM_MEM_ACCESS_TYPE; ++type) { - for (unsigned fail = 0; fail < NUM_CACHE_RESERVATION_FAIL_STATUS; ++fail) { - if (m_fail_stats[type][fail] > 0) { - fprintf(fout, "\t%s[%s][%s] = %llu\n", m_cache_name.c_str(), - mem_access_type_str((enum mem_access_type)type), - cache_fail_status_str((enum cache_reservation_fail_reason)fail), - m_fail_stats[type][fail]); + for (auto iter = m_fail_stats.begin(); iter != m_fail_stats.end(); ++iter) { + unsigned long long streamid = iter->first; + // when streamID is specified, skip stats for all other streams; otherwise, + // print stats from all streams + if ((streamID != -1) && (streamid != streamID)) continue; + for (unsigned type = 0; type < NUM_MEM_ACCESS_TYPE; ++type) { + for (unsigned fail = 0; fail < NUM_CACHE_RESERVATION_FAIL_STATUS; + ++fail) { + if (m_fail_stats.at(streamid)[type][fail] > 0) { + fprintf( + fout, "\t%s[%s][%s] = %llu\n", m_cache_name.c_str(), + mem_access_type_str((enum mem_access_type)type), + cache_fail_status_str((enum cache_reservation_fail_reason)fail), + m_fail_stats.at(streamid)[type][fail]); + } } } } @@ -822,11 +1005,14 @@ unsigned long long cache_stats::get_stats( /// cache_request_statuses.
/// unsigned long long total = 0; - for (unsigned type = 0; type < num_access_type; ++type) { - for (unsigned status = 0; status < num_access_status; ++status) { - if (!check_valid((int)access_type[type], (int)access_status[status])) - assert(0 && "Unknown cache access type or access outcome"); - total += m_stats[access_type[type]][access_status[status]]; + for (auto iter = m_stats.begin(); iter != m_stats.end(); ++iter) { + unsigned long long streamID = iter->first; + for (unsigned type = 0; type < num_access_type; ++type) { + for (unsigned status = 0; status < num_access_status; ++status) { + if (!check_valid((int)access_type[type], (int)access_status[status])) + assert(0 && "Unknown cache access type or access outcome"); + total += m_stats.at(streamID)[access_type[type]][access_status[status]]; + } } } return total; @@ -839,18 +1025,23 @@ void cache_stats::get_sub_stats(struct cache_sub_stats &css) const { struct cache_sub_stats t_css; t_css.clear(); - for (unsigned type = 0; type < NUM_MEM_ACCESS_TYPE; ++type) { - for (unsigned status = 0; status < NUM_CACHE_REQUEST_STATUS; ++status) { - if (status == HIT || status == MISS || status == SECTOR_MISS || - status == HIT_RESERVED) - t_css.accesses += m_stats[type][status]; + for (auto iter = m_stats.begin(); iter != m_stats.end(); ++iter) { + unsigned long long streamID = iter->first; + for (unsigned type = 0; type < NUM_MEM_ACCESS_TYPE; ++type) { + for (unsigned status = 0; status < NUM_CACHE_REQUEST_STATUS; ++status) { + if (status == HIT || status == MISS || status == SECTOR_MISS || + status == HIT_RESERVED) + t_css.accesses += m_stats.at(streamID)[type][status]; - if (status == MISS || status == SECTOR_MISS) - t_css.misses += m_stats[type][status]; + if (status == MISS || status == SECTOR_MISS) + t_css.misses += m_stats.at(streamID)[type][status]; - if (status == HIT_RESERVED) t_css.pending_hits += m_stats[type][status]; + if (status == HIT_RESERVED) + t_css.pending_hits += m_stats.at(streamID)[type][status]; - if (status == RESERVATION_FAIL) t_css.res_fails += m_stats[type][status]; + if (status == RESERVATION_FAIL) + t_css.res_fails += m_stats.at(streamID)[type][status]; + } } } @@ -868,41 +1059,48 @@ void cache_stats::get_sub_stats_pw(struct cache_sub_stats_pw &css) const { struct cache_sub_stats_pw t_css; t_css.clear(); - for (unsigned type = 0; type < NUM_MEM_ACCESS_TYPE; ++type) { - for (unsigned status = 0; status < NUM_CACHE_REQUEST_STATUS; ++status) { - if (status == HIT || status == MISS || status == SECTOR_MISS || - status == HIT_RESERVED) - t_css.accesses += m_stats_pw[type][status]; - - if (status == HIT) { - if (type == GLOBAL_ACC_R || type == CONST_ACC_R || type == INST_ACC_R) { - t_css.read_hits += m_stats_pw[type][status]; - } else if (type == GLOBAL_ACC_W) { - t_css.write_hits += m_stats_pw[type][status]; + for (auto iter = m_stats_pw.begin(); iter != m_stats_pw.end(); ++iter) { + unsigned long long streamID = iter->first; + for (unsigned type = 0; type < NUM_MEM_ACCESS_TYPE; ++type) { + for (unsigned status = 0; status < NUM_CACHE_REQUEST_STATUS; ++status) { + if (status == HIT || status == MISS || status == SECTOR_MISS || + status == HIT_RESERVED) + t_css.accesses += m_stats_pw.at(streamID)[type][status]; + + if (status == HIT) { + if (type == GLOBAL_ACC_R || type == CONST_ACC_R || + type == INST_ACC_R) { + t_css.read_hits += m_stats_pw.at(streamID)[type][status]; + } else if (type == GLOBAL_ACC_W) { + t_css.write_hits += m_stats_pw.at(streamID)[type][status]; + } } - } - if (status == MISS || status == SECTOR_MISS) 
{ - if (type == GLOBAL_ACC_R || type == CONST_ACC_R || type == INST_ACC_R) { - t_css.read_misses += m_stats_pw[type][status]; - } else if (type == GLOBAL_ACC_W) { - t_css.write_misses += m_stats_pw[type][status]; + if (status == MISS || status == SECTOR_MISS) { + if (type == GLOBAL_ACC_R || type == CONST_ACC_R || + type == INST_ACC_R) { + t_css.read_misses += m_stats_pw.at(streamID)[type][status]; + } else if (type == GLOBAL_ACC_W) { + t_css.write_misses += m_stats_pw.at(streamID)[type][status]; + } } - } - if (status == HIT_RESERVED) { - if (type == GLOBAL_ACC_R || type == CONST_ACC_R || type == INST_ACC_R) { - t_css.read_pending_hits += m_stats_pw[type][status]; - } else if (type == GLOBAL_ACC_W) { - t_css.write_pending_hits += m_stats_pw[type][status]; + if (status == HIT_RESERVED) { + if (type == GLOBAL_ACC_R || type == CONST_ACC_R || + type == INST_ACC_R) { + t_css.read_pending_hits += m_stats_pw.at(streamID)[type][status]; + } else if (type == GLOBAL_ACC_W) { + t_css.write_pending_hits += m_stats_pw.at(streamID)[type][status]; + } } - } - if (status == RESERVATION_FAIL) { - if (type == GLOBAL_ACC_R || type == CONST_ACC_R || type == INST_ACC_R) { - t_css.read_res_fails += m_stats_pw[type][status]; - } else if (type == GLOBAL_ACC_W) { - t_css.write_res_fails += m_stats_pw[type][status]; + if (status == RESERVATION_FAIL) { + if (type == GLOBAL_ACC_R || type == CONST_ACC_R || + type == INST_ACC_R) { + t_css.read_res_fails += m_stats_pw.at(streamID)[type][status]; + } else if (type == GLOBAL_ACC_W) { + t_css.write_res_fails += m_stats_pw.at(streamID)[type][status]; + } } } } @@ -1057,8 +1255,7 @@ void baseline_cache::fill(mem_fetch *mf, unsigned time) { if (m_config.m_alloc_policy == ON_MISS) m_tag_array->fill(e->second.m_cache_index, time, mf); else if (m_config.m_alloc_policy == ON_FILL) { - m_tag_array->fill(e->second.m_block_addr, time, mf); - if (m_config.is_streaming()) m_tag_array->remove_pending_line(mf); + m_tag_array->fill(e->second.m_block_addr, time, mf, mf->is_write()); } else abort(); bool has_atomic = false; @@ -1066,9 +1263,13 @@ void baseline_cache::fill(mem_fetch *mf, unsigned time) { if (has_atomic) { assert(m_config.m_alloc_policy == ON_MISS); cache_block_t *block = m_tag_array->get_block(e->second.m_cache_index); + if (!block->is_modified_line()) { + m_tag_array->inc_dirty(); + } block->set_status(MODIFIED, mf->get_access_sector_mask()); // mark line as dirty for // atomic operation + block->set_byte_mask(mf); } m_extra_mf_fields.erase(mf); m_bandwidth_management.use_fill_port(mf); @@ -1092,6 +1293,50 @@ void baseline_cache::display_state(FILE *fp) const { fprintf(fp, "\n"); } +void baseline_cache::inc_aggregated_stats(cache_request_status status, + cache_request_status cache_status, + mem_fetch *mf, + enum cache_gpu_level level) { + if (level == L1_GPU_CACHE) { + m_gpu->aggregated_l1_stats.inc_stats( + mf->get_streamID(), mf->get_access_type(), + m_gpu->aggregated_l1_stats.select_stats_status(status, cache_status)); + } else if (level == L2_GPU_CACHE) { + m_gpu->aggregated_l2_stats.inc_stats( + mf->get_streamID(), mf->get_access_type(), + m_gpu->aggregated_l2_stats.select_stats_status(status, cache_status)); + } +} + +void baseline_cache::inc_aggregated_fail_stats( + cache_request_status status, cache_request_status cache_status, + mem_fetch *mf, enum cache_gpu_level level) { + if (level == L1_GPU_CACHE) { + m_gpu->aggregated_l1_stats.inc_fail_stats( + mf->get_streamID(), mf->get_access_type(), + m_gpu->aggregated_l1_stats.select_stats_status(status, cache_status)); 
+ } else if (level == L2_GPU_CACHE) { + m_gpu->aggregated_l2_stats.inc_fail_stats( + mf->get_streamID(), mf->get_access_type(), + m_gpu->aggregated_l2_stats.select_stats_status(status, cache_status)); + } +} + +void baseline_cache::inc_aggregated_stats_pw(cache_request_status status, + cache_request_status cache_status, + mem_fetch *mf, + enum cache_gpu_level level) { + if (level == L1_GPU_CACHE) { + m_gpu->aggregated_l1_stats.inc_stats_pw( + mf->get_streamID(), mf->get_access_type(), + m_gpu->aggregated_l1_stats.select_stats_status(status, cache_status)); + } else if (level == L2_GPU_CACHE) { + m_gpu->aggregated_l2_stats.inc_stats_pw( + mf->get_streamID(), mf->get_access_type(), + m_gpu->aggregated_l2_stats.select_stats_status(status, cache_status)); + } +} + /// Read miss handler without writeback void baseline_cache::send_read_request(new_addr_type addr, new_addr_type block_addr, @@ -1123,6 +1368,7 @@ void baseline_cache::send_read_request(new_addr_type addr, m_tag_array->access(block_addr, time, cache_index, wb, evicted, mf); m_mshrs.add(mshr_addr, mf); + m_stats.inc_stats(mf->get_access_type(), MSHR_HIT, mf->get_streamID()); do_miss = true; } else if (!mshr_hit && mshr_avail && @@ -1133,9 +1379,6 @@ void baseline_cache::send_read_request(new_addr_type addr, m_tag_array->access(block_addr, time, cache_index, wb, evicted, mf); m_mshrs.add(mshr_addr, mf); - if (m_config.is_streaming() && m_config.m_cache_type == SECTOR) { - m_tag_array->add_pending_line(mf); - } m_extra_mf_fields[mf] = extra_mf_fields( mshr_addr, mf->get_addr(), cache_index, mf->get_data_size(), m_config); mf->set_data_size(m_config.get_atom_sz()); @@ -1146,9 +1389,11 @@ void baseline_cache::send_read_request(new_addr_type addr, do_miss = true; } else if (mshr_hit && !mshr_avail) - m_stats.inc_fail_stats(mf->get_access_type(), MSHR_MERGE_ENRTY_FAIL); + m_stats.inc_fail_stats(mf->get_access_type(), MSHR_MERGE_ENRTY_FAIL, + mf->get_streamID()); else if (!mshr_hit && !mshr_avail) - m_stats.inc_fail_stats(mf->get_access_type(), MSHR_ENRTY_FAIL); + m_stats.inc_fail_stats(mf->get_access_type(), MSHR_ENRTY_FAIL, + mf->get_streamID()); else assert(0); } @@ -1162,6 +1407,24 @@ void data_cache::send_write_request(mem_fetch *mf, cache_event request, mf->set_status(m_miss_queue_status, time); } +void data_cache::update_m_readable(mem_fetch *mf, unsigned cache_index) { + cache_block_t *block = m_tag_array->get_block(cache_index); + for (unsigned i = 0; i < SECTOR_CHUNCK_SIZE; i++) { + if (mf->get_access_sector_mask().test(i)) { + bool all_set = true; + for (unsigned k = i * SECTOR_SIZE; k < (i + 1) * SECTOR_SIZE; k++) { + // If any bit in the byte mask (within the sector) is not set, + // the sector is unreadable + if (!block->get_dirty_byte_mask().test(k)) { + all_set = false; + break; + } + } + if (all_set) block->set_m_readable(true, mf->get_access_sector_mask()); + } + } +} + /****** Write-hit functions (Set by config file) ******/ /// Write-back hit: Mark block as modified @@ -1173,7 +1436,12 @@ cache_request_status data_cache::wr_hit_wb(new_addr_type addr, new_addr_type block_addr = m_config.block_addr(addr); m_tag_array->access(block_addr, time, cache_index, mf); // update LRU state cache_block_t *block = m_tag_array->get_block(cache_index); + if (!block->is_modified_line()) { + m_tag_array->inc_dirty(); + } block->set_status(MODIFIED, mf->get_access_sector_mask()); + block->set_byte_mask(mf); + update_m_readable(mf, cache_index); return HIT; } @@ -1185,14 +1453,20 @@ cache_request_status data_cache::wr_hit_wt(new_addr_type
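
A quick illustration of the readability rule enforced by update_m_readable() above, using the simulator's 32-byte sectors with four sectors per 128-byte line (SECTOR_SIZE and SECTOR_CHUNCK_SIZE defaults; the helper below is illustrative, not part of the patch):

#include <bitset>

static const unsigned SECTOR_BYTES = 32;      // SECTOR_SIZE
static const unsigned SECTORS_PER_LINE = 4;   // SECTOR_CHUNCK_SIZE

// A sector may be marked readable again only when every one of its bytes is
// covered by the dirty byte mask, i.e. a read hit could be served entirely
// from locally written data, with no fetch from the lower level.
static bool sector_fully_written(
    const std::bitset<SECTOR_BYTES * SECTORS_PER_LINE> &dirty_bytes,
    unsigned sector) {
  for (unsigned k = sector * SECTOR_BYTES; k < (sector + 1) * SECTOR_BYTES; ++k)
    if (!dirty_bytes.test(k)) return false;
  return true;
}

For example, a single 4-byte store to sector 0 leaves sector_fully_written(mask, 0) false, so a later read of that sector still needs a fetch; eight non-overlapping 4-byte stores make it true.
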
addr, std::list &events, enum cache_request_status status) { if (miss_queue_full(0)) { - m_stats.inc_fail_stats(mf->get_access_type(), MISS_QUEUE_FULL); + m_stats.inc_fail_stats(mf->get_access_type(), MISS_QUEUE_FULL, + mf->get_streamID()); return RESERVATION_FAIL; // cannot handle request this cycle } new_addr_type block_addr = m_config.block_addr(addr); m_tag_array->access(block_addr, time, cache_index, mf); // update LRU state cache_block_t *block = m_tag_array->get_block(cache_index); + if (!block->is_modified_line()) { + m_tag_array->inc_dirty(); + } block->set_status(MODIFIED, mf->get_access_sector_mask()); + block->set_byte_mask(mf); + update_m_readable(mf, cache_index); // generate a write-through send_write_request(mf, cache_event(WRITE_REQUEST_SENT), time, events); @@ -1208,7 +1482,8 @@ cache_request_status data_cache::wr_hit_we(new_addr_type addr, std::list &events, enum cache_request_status status) { if (miss_queue_full(0)) { - m_stats.inc_fail_stats(mf->get_access_type(), MISS_QUEUE_FULL); + m_stats.inc_fail_stats(mf->get_access_type(), MISS_QUEUE_FULL, + mf->get_streamID()); return RESERVATION_FAIL; // cannot handle request this cycle } @@ -1257,11 +1532,14 @@ enum cache_request_status data_cache::wr_miss_wa_naive( (m_miss_queue.size() < m_config.m_miss_queue_size)))) { // check what is the exactly the failure reason if (miss_queue_full(2)) - m_stats.inc_fail_stats(mf->get_access_type(), MISS_QUEUE_FULL); + m_stats.inc_fail_stats(mf->get_access_type(), MISS_QUEUE_FULL, + mf->get_streamID()); else if (mshr_hit && !mshr_avail) - m_stats.inc_fail_stats(mf->get_access_type(), MSHR_MERGE_ENRTY_FAIL); + m_stats.inc_fail_stats(mf->get_access_type(), MSHR_MERGE_ENRTY_FAIL, + mf->get_streamID()); else if (!mshr_hit && !mshr_avail) - m_stats.inc_fail_stats(mf->get_access_type(), MSHR_ENRTY_FAIL); + m_stats.inc_fail_stats(mf->get_access_type(), MSHR_ENRTY_FAIL, + mf->get_streamID()); else assert(0); @@ -1280,10 +1558,10 @@ enum cache_request_status data_cache::wr_miss_wa_naive( mf->get_access_warp_mask(), mf->get_access_byte_mask(), mf->get_access_sector_mask(), m_gpu->gpgpu_ctx); - mem_fetch *n_mf = - new mem_fetch(*ma, NULL, mf->get_ctrl_size(), mf->get_wid(), - mf->get_sid(), mf->get_tpc(), mf->get_mem_config(), - m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle); + mem_fetch *n_mf = new mem_fetch( + *ma, NULL, mf->get_streamID(), mf->get_ctrl_size(), mf->get_wid(), + mf->get_sid(), mf->get_tpc(), mf->get_mem_config(), + m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle); bool do_miss = false; bool wb = false; @@ -1302,12 +1580,14 @@ enum cache_request_status data_cache::wr_miss_wa_naive( assert(status == MISS); // SECTOR_MISS and HIT_RESERVED should not send write back mem_fetch *wb = m_memfetch_creator->alloc( - evicted.m_block_addr, m_wrbk_type, evicted.m_modified_size, true, - m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle); + evicted.m_block_addr, m_wrbk_type, mf->get_access_warp_mask(), + evicted.m_byte_mask, evicted.m_sector_mask, evicted.m_modified_size, + true, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, -1, -1, -1, + NULL, mf->get_streamID()); // the evicted block may have wrong chip id when advanced L2 hashing is // used, so set the right chip address from the original mf wb->set_chip(mf->get_tlx_addr().chip); - wb->set_parition(mf->get_tlx_addr().sub_partition); + wb->set_partition(mf->get_tlx_addr().sub_partition); send_write_request(wb, cache_event(WRITE_BACK_REQUEST_SENT, evicted), time, events); } @@ -1329,7 +1609,8 @@ enum cache_request_status 
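
The writeback requests built above now carry the victim's dirty byte and sector masks (via evicted_block_info) instead of just a modified size, so the lower level can merge exactly the bytes that are dirty. A hedged sketch of that payload (field comments name the real members; the struct itself and bitset widths, which assume 128-byte lines with four 32-byte sectors, are illustrative):

#include <bitset>

struct writeback_payload {
  unsigned long long block_addr;      // evicted_block_info::m_block_addr
  unsigned modified_size;             // m_modified_size, in bytes
  std::bitset<128> dirty_byte_mask;   // m_byte_mask (mem_access_byte_mask_t)
  std::bitset<4> dirty_sector_mask;   // m_sector_mask (mem_access_sector_mask_t)
};

// Only bytes flagged dirty need to travel with the writeback; clean bytes of
// the victim line can simply be dropped.
static unsigned bytes_to_write(const writeback_payload &wb) {
  return (unsigned)wb.dirty_byte_mask.count();
}
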
data_cache::wr_miss_wa_fetch_on_write( // reserve mshr if (miss_queue_full(0)) { - m_stats.inc_fail_stats(mf->get_access_type(), MISS_QUEUE_FULL); + m_stats.inc_fail_stats(mf->get_access_type(), MISS_QUEUE_FULL, + mf->get_streamID()); return RESERVATION_FAIL; // cannot handle request this cycle } @@ -1340,7 +1621,11 @@ enum cache_request_status data_cache::wr_miss_wa_fetch_on_write( m_tag_array->access(block_addr, time, cache_index, wb, evicted, mf); assert(status != HIT); cache_block_t *block = m_tag_array->get_block(cache_index); + if (!block->is_modified_line()) { + m_tag_array->inc_dirty(); + } block->set_status(MODIFIED, mf->get_access_sector_mask()); + block->set_byte_mask(mf); if (status == HIT_RESERVED) block->set_ignore_on_fill(true, mf->get_access_sector_mask()); @@ -1349,12 +1634,14 @@ enum cache_request_status data_cache::wr_miss_wa_fetch_on_write( // (already modified lower level) if (wb && (m_config.m_write_policy != WRITE_THROUGH)) { mem_fetch *wb = m_memfetch_creator->alloc( - evicted.m_block_addr, m_wrbk_type, evicted.m_modified_size, true, - m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle); + evicted.m_block_addr, m_wrbk_type, mf->get_access_warp_mask(), + evicted.m_byte_mask, evicted.m_sector_mask, evicted.m_modified_size, + true, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, -1, -1, -1, + NULL, mf->get_streamID()); // the evicted block may have wrong chip id when advanced L2 hashing is // used, so set the right chip address from the original mf wb->set_chip(mf->get_tlx_addr().chip); - wb->set_parition(mf->get_tlx_addr().sub_partition); + wb->set_partition(mf->get_tlx_addr().sub_partition); send_write_request(wb, cache_event(WRITE_BACK_REQUEST_SENT, evicted), time, events); } @@ -1370,11 +1657,14 @@ enum cache_request_status data_cache::wr_miss_wa_fetch_on_write( (m_miss_queue.size() < m_config.m_miss_queue_size)))) { // check what is the exactly the failure reason if (miss_queue_full(1)) - m_stats.inc_fail_stats(mf->get_access_type(), MISS_QUEUE_FULL); + m_stats.inc_fail_stats(mf->get_access_type(), MISS_QUEUE_FULL, + mf->get_streamID()); else if (mshr_hit && !mshr_avail) - m_stats.inc_fail_stats(mf->get_access_type(), MSHR_MERGE_ENRTY_FAIL); + m_stats.inc_fail_stats(mf->get_access_type(), MSHR_MERGE_ENRTY_FAIL, + mf->get_streamID()); else if (!mshr_hit && !mshr_avail) - m_stats.inc_fail_stats(mf->get_access_type(), MSHR_ENRTY_FAIL); + m_stats.inc_fail_stats(mf->get_access_type(), MSHR_ENRTY_FAIL, + mf->get_streamID()); else assert(0); @@ -1387,7 +1677,8 @@ enum cache_request_status data_cache::wr_miss_wa_fetch_on_write( if (m_mshrs.probe(mshr_addr) && m_mshrs.is_read_after_write_pending(mshr_addr) && mf->is_write()) { // assert(0); - m_stats.inc_fail_stats(mf->get_access_type(), MSHR_RW_PENDING); + m_stats.inc_fail_stats(mf->get_access_type(), MSHR_RW_PENDING, + mf->get_streamID()); return RESERVATION_FAIL; } @@ -1398,8 +1689,8 @@ enum cache_request_status data_cache::wr_miss_wa_fetch_on_write( mf->get_access_sector_mask(), m_gpu->gpgpu_ctx); mem_fetch *n_mf = new mem_fetch( - *ma, NULL, mf->get_ctrl_size(), mf->get_wid(), mf->get_sid(), - mf->get_tpc(), mf->get_mem_config(), + *ma, NULL, mf->get_streamID(), mf->get_ctrl_size(), mf->get_wid(), + mf->get_sid(), mf->get_tpc(), mf->get_mem_config(), m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, NULL, mf); new_addr_type block_addr = m_config.block_addr(addr); @@ -1411,6 +1702,7 @@ enum cache_request_status data_cache::wr_miss_wa_fetch_on_write( cache_block_t *block = m_tag_array->get_block(cache_index); 
block->set_modified_on_fill(true, mf->get_access_sector_mask()); + block->set_byte_mask_on_fill(true); events.push_back(cache_event(WRITE_ALLOCATE_SENT)); @@ -1419,12 +1711,14 @@ enum cache_request_status data_cache::wr_miss_wa_fetch_on_write( // (already modified lower level) if (wb && (m_config.m_write_policy != WRITE_THROUGH)) { mem_fetch *wb = m_memfetch_creator->alloc( - evicted.m_block_addr, m_wrbk_type, evicted.m_modified_size, true, - m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle); + evicted.m_block_addr, m_wrbk_type, mf->get_access_warp_mask(), + evicted.m_byte_mask, evicted.m_sector_mask, evicted.m_modified_size, + true, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, -1, -1, -1, + NULL, mf->get_streamID()); // the evicted block may have wrong chip id when advanced L2 hashing is // used, so set the right chip address from the original mf wb->set_chip(mf->get_tlx_addr().chip); - wb->set_parition(mf->get_tlx_addr().sub_partition); + wb->set_partition(mf->get_tlx_addr().sub_partition); send_write_request(wb, cache_event(WRITE_BACK_REQUEST_SENT, evicted), time, events); } @@ -1444,10 +1738,15 @@ enum cache_request_status data_cache::wr_miss_wa_lazy_fetch_on_read( // mshr if (miss_queue_full(0)) { - m_stats.inc_fail_stats(mf->get_access_type(), MISS_QUEUE_FULL); + m_stats.inc_fail_stats(mf->get_access_type(), MISS_QUEUE_FULL, + mf->get_streamID()); return RESERVATION_FAIL; // cannot handle request this cycle } + if (m_config.m_write_policy == WRITE_THROUGH) { + send_write_request(mf, cache_event(WRITE_REQUEST_SENT), time, events); + } + bool wb = false; evicted_block_info evicted; @@ -1455,29 +1754,39 @@ enum cache_request_status data_cache::wr_miss_wa_lazy_fetch_on_read( m_tag_array->access(block_addr, time, cache_index, wb, evicted, mf); assert(m_status != HIT); cache_block_t *block = m_tag_array->get_block(cache_index); + if (!block->is_modified_line()) { + m_tag_array->inc_dirty(); + } block->set_status(MODIFIED, mf->get_access_sector_mask()); + block->set_byte_mask(mf); if (m_status == HIT_RESERVED) { block->set_ignore_on_fill(true, mf->get_access_sector_mask()); block->set_modified_on_fill(true, mf->get_access_sector_mask()); + block->set_byte_mask_on_fill(true); } if (mf->get_access_byte_mask().count() == m_config.get_atom_sz()) { block->set_m_readable(true, mf->get_access_sector_mask()); } else { block->set_m_readable(false, mf->get_access_sector_mask()); + if (m_status == HIT_RESERVED) + block->set_readable_on_fill(true, mf->get_access_sector_mask()); } + update_m_readable(mf, cache_index); if (m_status != RESERVATION_FAIL) { // If evicted block is modified and not a write-through // (already modified lower level) if (wb && (m_config.m_write_policy != WRITE_THROUGH)) { mem_fetch *wb = m_memfetch_creator->alloc( - evicted.m_block_addr, m_wrbk_type, evicted.m_modified_size, true, - m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle); + evicted.m_block_addr, m_wrbk_type, mf->get_access_warp_mask(), + evicted.m_byte_mask, evicted.m_sector_mask, evicted.m_modified_size, + true, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, -1, -1, -1, + NULL, mf->get_streamID()); // the evicted block may have wrong chip id when advanced L2 hashing is // used, so set the right chip address from the original mf wb->set_chip(mf->get_tlx_addr().chip); - wb->set_parition(mf->get_tlx_addr().sub_partition); + wb->set_partition(mf->get_tlx_addr().sub_partition); send_write_request(wb, cache_event(WRITE_BACK_REQUEST_SENT, evicted), time, events); } @@ -1491,7 +1800,8 @@ enum cache_request_status 
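
The lazy-fetch-on-read path above defers the read-for-ownership fetch: a write miss allocates the line, marks it MODIFIED immediately, and only fetches later if a read needs bytes the writes did not cover. A minimal sketch of that decision (names are illustrative; atom_sz stands for m_config.get_atom_sz()):

struct sector_state {
  bool readable;          // set_m_readable(...)
  bool readable_on_fill;  // set_readable_on_fill(...), meaningful while a fill is pending
};

// Mirrors the logic above: a write covering the whole atom makes the sector
// readable right away; a partial write leaves it unreadable until either the
// pending fill returns (HIT_RESERVED) or later writes cover every byte.
static void on_write_allocate(sector_state &s, unsigned bytes_written,
                              unsigned atom_sz, bool fill_pending) {
  if (bytes_written == atom_sz) {
    s.readable = true;
  } else {
    s.readable = false;
    if (fill_pending) s.readable_on_fill = true;
  }
}
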
data_cache::wr_miss_no_wa( new_addr_type addr, unsigned cache_index, mem_fetch *mf, unsigned time, std::list &events, enum cache_request_status status) { if (miss_queue_full(0)) { - m_stats.inc_fail_stats(mf->get_access_type(), MISS_QUEUE_FULL); + m_stats.inc_fail_stats(mf->get_access_type(), MISS_QUEUE_FULL, + mf->get_streamID()); return RESERVATION_FAIL; // cannot handle request this cycle } @@ -1516,8 +1826,12 @@ enum cache_request_status data_cache::rd_hit_base( if (mf->isatomic()) { assert(mf->get_access_type() == GLOBAL_ACC_R); cache_block_t *block = m_tag_array->get_block(cache_index); + if (!block->is_modified_line()) { + m_tag_array->inc_dirty(); + } block->set_status(MODIFIED, - mf->get_access_sector_mask()); // mark line as dirty + mf->get_access_sector_mask()); // mark line as + block->set_byte_mask(mf); } return HIT; } @@ -1532,7 +1846,8 @@ enum cache_request_status data_cache::rd_miss_base( if (miss_queue_full(1)) { // cannot handle request this cycle // (might need to generate two requests) - m_stats.inc_fail_stats(mf->get_access_type(), MISS_QUEUE_FULL); + m_stats.inc_fail_stats(mf->get_access_type(), MISS_QUEUE_FULL, + mf->get_streamID()); return RESERVATION_FAIL; } @@ -1548,12 +1863,14 @@ enum cache_request_status data_cache::rd_miss_base( // (already modified lower level) if (wb && (m_config.m_write_policy != WRITE_THROUGH)) { mem_fetch *wb = m_memfetch_creator->alloc( - evicted.m_block_addr, m_wrbk_type, evicted.m_modified_size, true, - m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle); + evicted.m_block_addr, m_wrbk_type, mf->get_access_warp_mask(), + evicted.m_byte_mask, evicted.m_sector_mask, evicted.m_modified_size, + true, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, -1, -1, -1, + NULL, mf->get_streamID()); // the evicted block may have wrong chip id when advanced L2 hashing is // used, so set the right chip address from the original mf wb->set_chip(mf->get_tlx_addr().chip); - wb->set_parition(mf->get_tlx_addr().sub_partition); + wb->set_partition(mf->get_tlx_addr().sub_partition); send_write_request(wb, WRITE_BACK_REQUEST_SENT, time, events); } return MISS; @@ -1572,7 +1889,7 @@ enum cache_request_status read_only_cache::access( new_addr_type block_addr = m_config.block_addr(addr); unsigned cache_index = (unsigned)-1; enum cache_request_status status = - m_tag_array->probe(block_addr, cache_index, mf); + m_tag_array->probe(block_addr, cache_index, mf, mf->is_write()); enum cache_request_status cache_status = RESERVATION_FAIL; if (status == HIT) { @@ -1589,16 +1906,20 @@ enum cache_request_status read_only_cache::access( cache_status = RESERVATION_FAIL; } else { cache_status = RESERVATION_FAIL; - m_stats.inc_fail_stats(mf->get_access_type(), MISS_QUEUE_FULL); + m_stats.inc_fail_stats(mf->get_access_type(), MISS_QUEUE_FULL, + mf->get_streamID()); } } else { - m_stats.inc_fail_stats(mf->get_access_type(), LINE_ALLOC_FAIL); + m_stats.inc_fail_stats(mf->get_access_type(), LINE_ALLOC_FAIL, + mf->get_streamID()); } m_stats.inc_stats(mf->get_access_type(), - m_stats.select_stats_status(status, cache_status)); + m_stats.select_stats_status(status, cache_status), + mf->get_streamID()); m_stats.inc_stats_pw(mf->get_access_type(), - m_stats.select_stats_status(status, cache_status)); + m_stats.select_stats_status(status, cache_status), + mf->get_streamID()); return cache_status; } @@ -1626,7 +1947,8 @@ enum cache_request_status data_cache::process_tag_probe( } else { // the only reason for reservation fail here is LINE_ALLOC_FAIL (i.e all // lines are reserved) - 
m_stats.inc_fail_stats(mf->get_access_type(), LINE_ALLOC_FAIL); + m_stats.inc_fail_stats(mf->get_access_type(), LINE_ALLOC_FAIL, + mf->get_streamID()); } } else { // Read if (probe_status == HIT) { @@ -1638,7 +1960,8 @@ enum cache_request_status data_cache::process_tag_probe( } else { // the only reason for reservation fail here is LINE_ALLOC_FAIL (i.e all // lines are reserved) - m_stats.inc_fail_stats(mf->get_access_type(), LINE_ALLOC_FAIL); + m_stats.inc_fail_stats(mf->get_access_type(), LINE_ALLOC_FAIL, + mf->get_streamID()); } } @@ -1659,13 +1982,15 @@ enum cache_request_status data_cache::access(new_addr_type addr, mem_fetch *mf, new_addr_type block_addr = m_config.block_addr(addr); unsigned cache_index = (unsigned)-1; enum cache_request_status probe_status = - m_tag_array->probe(block_addr, cache_index, mf, true); + m_tag_array->probe(block_addr, cache_index, mf, mf->is_write(), true); enum cache_request_status access_status = process_tag_probe(wr, probe_status, addr, cache_index, mf, time, events); m_stats.inc_stats(mf->get_access_type(), - m_stats.select_stats_status(probe_status, access_status)); - m_stats.inc_stats_pw(mf->get_access_type(), m_stats.select_stats_status( - probe_status, access_status)); + m_stats.select_stats_status(probe_status, access_status), + mf->get_streamID()); + m_stats.inc_stats_pw(mf->get_access_type(), + m_stats.select_stats_status(probe_status, access_status), + mf->get_streamID()); return access_status; } @@ -1727,14 +2052,17 @@ enum cache_request_status tex_cache::access(new_addr_type addr, mem_fetch *mf, cache_status = HIT_RESERVED; } m_stats.inc_stats(mf->get_access_type(), - m_stats.select_stats_status(status, cache_status)); + m_stats.select_stats_status(status, cache_status), + mf->get_streamID()); m_stats.inc_stats_pw(mf->get_access_type(), - m_stats.select_stats_status(status, cache_status)); + m_stats.select_stats_status(status, cache_status), + mf->get_streamID()); return cache_status; } void tex_cache::cycle() { // send next request to lower level of memory + // TODO: Use different full() for sst_mem_interface? if (!m_request_fifo.empty()) { mem_fetch *mf = m_request_fifo.peek(); if (!m_memport->full(mf->get_ctrl_size(), false)) { diff --git a/src/gpgpu-sim/gpu-cache.h b/src/gpgpu-sim/gpu-cache.h index 5c28b41f6..c07695fa8 100644 --- a/src/gpgpu-sim/gpu-cache.h +++ b/src/gpgpu-sim/gpu-cache.h @@ -1,18 +1,21 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Tayler Hetherington -// The University of British Columbia +// Copyright (c) 2009-2021, Tor M. Aamodt, Tayler Hetherington, Vijay Kandiah, +// Nikos Hardavellas, Mahmoud Khairy, Junrui Pan, Timothy G. Rogers The +// University of British Columbia, Northwestern University, Purdue University // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. 
Redistributions of source code must retain the above copyright notice, +// this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -49,6 +52,7 @@ enum cache_request_status { MISS, RESERVATION_FAIL, SECTOR_MISS, + MSHR_HIT, NUM_CACHE_REQUEST_STATUS }; @@ -68,17 +72,36 @@ enum cache_event_type { WRITE_ALLOCATE_SENT }; +enum cache_gpu_level { + L1_GPU_CACHE = 0, + L2_GPU_CACHE, + OTHER_GPU_CACHE, + NUM_CACHE_GPU_LEVELS +}; + struct evicted_block_info { new_addr_type m_block_addr; unsigned m_modified_size; + mem_access_byte_mask_t m_byte_mask; + mem_access_sector_mask_t m_sector_mask; evicted_block_info() { m_block_addr = 0; m_modified_size = 0; + m_byte_mask.reset(); + m_sector_mask.reset(); } void set_info(new_addr_type block_addr, unsigned modified_size) { m_block_addr = block_addr; m_modified_size = modified_size; } + void set_info(new_addr_type block_addr, unsigned modified_size, + mem_access_byte_mask_t byte_mask, + mem_access_sector_mask_t sector_mask) { + m_block_addr = block_addr; + m_modified_size = modified_size; + m_byte_mask = byte_mask; + m_sector_mask = sector_mask; + } }; struct cache_event { @@ -108,7 +131,8 @@ struct cache_block_t { virtual void allocate(new_addr_type tag, new_addr_type block_addr, unsigned time, mem_access_sector_mask_t sector_mask) = 0; - virtual void fill(unsigned time, mem_access_sector_mask_t sector_mask) = 0; + virtual void fill(unsigned time, mem_access_sector_mask_t sector_mask, + mem_access_byte_mask_t byte_mask) = 0; virtual bool is_invalid_line() = 0; virtual bool is_valid_line() = 0; @@ -119,7 +143,10 @@ struct cache_block_t { mem_access_sector_mask_t sector_mask) = 0; virtual void set_status(enum cache_block_state m_status, mem_access_sector_mask_t sector_mask) = 0; - + virtual void set_byte_mask(mem_fetch *mf) = 0; + virtual void set_byte_mask(mem_access_byte_mask_t byte_mask) = 0; + virtual mem_access_byte_mask_t get_dirty_byte_mask() = 0; + virtual mem_access_sector_mask_t get_dirty_sector_mask() = 0; virtual unsigned long long get_last_access_time() = 0; virtual void set_last_access_time(unsigned long long time, mem_access_sector_mask_t sector_mask) = 0; @@ -128,6 +155,9 @@ struct cache_block_t { mem_access_sector_mask_t sector_mask) = 0; virtual void set_modified_on_fill(bool m_modified, mem_access_sector_mask_t sector_mask) = 0; + virtual void set_readable_on_fill(bool readable, + mem_access_sector_mask_t sector_mask) = 0; + virtual void set_byte_mask_on_fill(bool m_modified) = 0; virtual unsigned get_modified_size() = 0; virtual void set_m_readable(bool readable, mem_access_sector_mask_t sector_mask) = 0; @@ -147,6 +177,7 @@ struct line_cache_block : public cache_block_t { m_status = INVALID; m_ignore_on_fill_status = false; m_set_modified_on_fill = false; + m_set_readable_on_fill = false; m_readable = true; } void allocate(new_addr_type tag, new_addr_type block_addr, unsigned time, @@ -159,13 +190,19 @@ struct 
line_cache_block : public cache_block_t { m_status = RESERVED; m_ignore_on_fill_status = false; m_set_modified_on_fill = false; + m_set_readable_on_fill = false; + m_set_byte_mask_on_fill = false; } - void fill(unsigned time, mem_access_sector_mask_t sector_mask) { + virtual void fill(unsigned time, mem_access_sector_mask_t sector_mask, + mem_access_byte_mask_t byte_mask) { // if(!m_ignore_on_fill_status) // assert( m_status == RESERVED ); m_status = m_set_modified_on_fill ? MODIFIED : VALID; + if (m_set_readable_on_fill) m_readable = true; + if (m_set_byte_mask_on_fill) set_byte_mask(byte_mask); + m_fill_time = time; } virtual bool is_invalid_line() { return m_status == INVALID; } @@ -181,6 +218,20 @@ struct line_cache_block : public cache_block_t { mem_access_sector_mask_t sector_mask) { m_status = status; } + virtual void set_byte_mask(mem_fetch *mf) { + m_dirty_byte_mask = m_dirty_byte_mask | mf->get_access_byte_mask(); + } + virtual void set_byte_mask(mem_access_byte_mask_t byte_mask) { + m_dirty_byte_mask = m_dirty_byte_mask | byte_mask; + } + virtual mem_access_byte_mask_t get_dirty_byte_mask() { + return m_dirty_byte_mask; + } + virtual mem_access_sector_mask_t get_dirty_sector_mask() { + mem_access_sector_mask_t sector_mask; + if (m_status == MODIFIED) sector_mask.set(); + return sector_mask; + } virtual unsigned long long get_last_access_time() { return m_last_access_time; } @@ -197,6 +248,13 @@ struct line_cache_block : public cache_block_t { mem_access_sector_mask_t sector_mask) { m_set_modified_on_fill = m_modified; } + virtual void set_readable_on_fill(bool readable, + mem_access_sector_mask_t sector_mask) { + m_set_readable_on_fill = readable; + } + virtual void set_byte_mask_on_fill(bool m_modified) { + m_set_byte_mask_on_fill = m_modified; + } virtual unsigned get_modified_size() { return SECTOR_CHUNCK_SIZE * SECTOR_SIZE; // i.e. 
cache line size } @@ -218,7 +276,10 @@ struct line_cache_block : public cache_block_t { cache_block_state m_status; bool m_ignore_on_fill_status; bool m_set_modified_on_fill; + bool m_set_readable_on_fill; + bool m_set_byte_mask_on_fill; bool m_readable; + mem_access_byte_mask_t m_dirty_byte_mask; }; struct sector_cache_block : public cache_block_t { @@ -232,11 +293,13 @@ struct sector_cache_block : public cache_block_t { m_status[i] = INVALID; m_ignore_on_fill_status[i] = false; m_set_modified_on_fill[i] = false; + m_set_readable_on_fill[i] = false; m_readable[i] = true; } m_line_alloc_time = 0; m_line_last_access_time = 0; m_line_fill_time = 0; + m_dirty_byte_mask.reset(); } virtual void allocate(new_addr_type tag, new_addr_type block_addr, @@ -261,6 +324,8 @@ struct sector_cache_block : public cache_block_t { m_status[sidx] = RESERVED; m_ignore_on_fill_status[sidx] = false; m_set_modified_on_fill[sidx] = false; + m_set_readable_on_fill[sidx] = false; + m_set_byte_mask_on_fill = false; // set line stats m_line_alloc_time = time; // only set this for the first allocated sector @@ -283,6 +348,8 @@ struct sector_cache_block : public cache_block_t { else m_set_modified_on_fill[sidx] = false; + m_set_readable_on_fill[sidx] = false; + m_status[sidx] = RESERVED; m_ignore_on_fill_status[sidx] = false; // m_set_modified_on_fill[sidx] = false; @@ -293,14 +360,20 @@ struct sector_cache_block : public cache_block_t { m_line_fill_time = 0; } - virtual void fill(unsigned time, mem_access_sector_mask_t sector_mask) { + virtual void fill(unsigned time, mem_access_sector_mask_t sector_mask, + mem_access_byte_mask_t byte_mask) { unsigned sidx = get_sector_index(sector_mask); // if(!m_ignore_on_fill_status[sidx]) // assert( m_status[sidx] == RESERVED ); - m_status[sidx] = m_set_modified_on_fill[sidx] ? 
MODIFIED : VALID; + if (m_set_readable_on_fill[sidx]) { + m_readable[sidx] = true; + m_set_readable_on_fill[sidx] = false; + } + if (m_set_byte_mask_on_fill) set_byte_mask(byte_mask); + m_sector_fill_time[sidx] = time; m_line_fill_time = time; } @@ -340,6 +413,22 @@ struct sector_cache_block : public cache_block_t { m_status[sidx] = status; } + virtual void set_byte_mask(mem_fetch *mf) { + m_dirty_byte_mask = m_dirty_byte_mask | mf->get_access_byte_mask(); + } + virtual void set_byte_mask(mem_access_byte_mask_t byte_mask) { + m_dirty_byte_mask = m_dirty_byte_mask | byte_mask; + } + virtual mem_access_byte_mask_t get_dirty_byte_mask() { + return m_dirty_byte_mask; + } + virtual mem_access_sector_mask_t get_dirty_sector_mask() { + mem_access_sector_mask_t sector_mask; + for (unsigned i = 0; i < SECTOR_CHUNCK_SIZE; i++) { + if (m_status[i] == MODIFIED) sector_mask.set(i); + } + return sector_mask; + } virtual unsigned long long get_last_access_time() { return m_line_last_access_time; } @@ -365,7 +454,15 @@ struct sector_cache_block : public cache_block_t { unsigned sidx = get_sector_index(sector_mask); m_set_modified_on_fill[sidx] = m_modified; } + virtual void set_byte_mask_on_fill(bool m_modified) { + m_set_byte_mask_on_fill = m_modified; + } + virtual void set_readable_on_fill(bool readable, + mem_access_sector_mask_t sector_mask) { + unsigned sidx = get_sector_index(sector_mask); + m_set_readable_on_fill[sidx] = readable; + } virtual void set_m_readable(bool readable, mem_access_sector_mask_t sector_mask) { unsigned sidx = get_sector_index(sector_mask); @@ -400,13 +497,17 @@ struct sector_cache_block : public cache_block_t { cache_block_state m_status[SECTOR_CHUNCK_SIZE]; bool m_ignore_on_fill_status[SECTOR_CHUNCK_SIZE]; bool m_set_modified_on_fill[SECTOR_CHUNCK_SIZE]; + bool m_set_readable_on_fill[SECTOR_CHUNCK_SIZE]; + bool m_set_byte_mask_on_fill; bool m_readable[SECTOR_CHUNCK_SIZE]; + mem_access_byte_mask_t m_dirty_byte_mask; unsigned get_sector_index(mem_access_sector_mask_t sector_mask) { assert(sector_mask.count() == 1); for (unsigned i = 0; i < SECTOR_CHUNCK_SIZE; ++i) { if (sector_mask.to_ulong() & (1 << i)) return i; } + return SECTOR_CHUNCK_SIZE; // error } }; @@ -463,6 +564,7 @@ class cache_config { m_data_port_width = 0; m_set_index_function = LINEAR_SET_FUNCTION; m_is_streaming = false; + m_wr_percent = 0; } void init(char *config, FuncCache status) { cache_status = status; @@ -503,16 +605,6 @@ class cache_config { default: exit_parse_error(); } - switch (rp) { - case 'L': - m_replacement_policy = LRU; - break; - case 'F': - m_replacement_policy = FIFO; - break; - default: - exit_parse_error(); - } switch (wp) { case 'R': m_write_policy = READ_ONLY; @@ -546,22 +638,27 @@ class cache_config { exit_parse_error(); } if (m_alloc_policy == STREAMING) { - // For streaming cache, we set the alloc policy to be on-fill to remove - // all line_alloc_fail stalls we set the MSHRs to be equal to max - // allocated cache lines. This is possible by moving TAG to be shared - // between cache line and MSHR enrty (i.e. 
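
Both block flavors resolve their final state lazily at fill time from flags latched while the request was outstanding. A compact sketch of that resolution (the struct and enum are illustrative; the flag names mirror the patch):

enum fill_state { FILL_VALID, FILL_MODIFIED };

struct fill_flags {
  bool modified_on_fill;   // a write was merged while the fill was pending
  bool readable_on_fill;   // the fill supplies the bytes a partial write left missing
  bool byte_mask_on_fill;  // latch the access byte mask into the dirty mask on fill
};

// On fill, a RESERVED block normally becomes VALID; if a write was merged
// into it while the miss was outstanding, it must land as MODIFIED so the
// locally written bytes are not silently dropped.
static fill_state resolve_on_fill(const fill_flags &f) {
  return f.modified_on_fill ? FILL_MODIFIED : FILL_VALID;
}
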
for each cache line, there is - // an MSHR rntey associated with it) This is the easiest think we can - // think about to model (mimic) L1 streaming cache in Pascal and Volta - // Based on our microbenchmakrs, MSHRs entries have been increasing - // substantially in Pascal and Volta For more information about streaming - // cache, see: - // http://on-demand.gputechconf.com/gtc/2017/presentation/s7798-luke-durant-inside-volta.pdf - // https://ieeexplore.ieee.org/document/8344474/ + /* + For streaming cache: + (1) we set the alloc policy to be on-fill to remove all line_alloc_fail + stalls. If the whole memory footprint is allocated in the L1 cache, the + allocation policy is set to ON_MISS; otherwise it is ON_FILL, which + eliminates line-allocation fails. I.e., MSHR throughput is the same, + independent of the L1 cache size/associativity. So we set the allocation + policy on a per-kernel basis; see shader.cc, max_cta() function + + (2) We also set the MSHRs to be equal to max + allocated cache lines. This is possible by moving TAG to be shared + between cache line and MSHR entry (i.e. for each cache line, there is + an MSHR entry associated with it). This is the easiest thing we can + think of to model (mimic) the L1 streaming cache in Pascal and Volta + + For more information about streaming cache, see: + http://on-demand.gputechconf.com/gtc/2017/presentation/s7798-luke-durant-inside-volta.pdf + https://ieeexplore.ieee.org/document/8344474/ + */ m_is_streaming = true; m_alloc_policy = ON_FILL; - m_mshr_entries = m_nset * m_assoc * MAX_DEFAULT_CACHE_SIZE_MULTIBLIER; - if (m_cache_type == SECTOR) m_mshr_entries *= SECTOR_CHUNCK_SIZE; - m_mshr_max_merge = MAX_WARP_PER_SM; } switch (mshr_type) { case 'F': @@ -610,7 +707,8 @@ class cache_config { } // detect invalid configuration - if (m_alloc_policy == ON_FILL and m_write_policy == WRITE_BACK) { + if ((m_alloc_policy == ON_FILL || m_alloc_policy == STREAMING) and + m_write_policy == WRITE_BACK) { // A writeback cache with allocate-on-fill policy will inevitably lead to // deadlock: The deadlock happens when an incoming cache-fill evicts a // dirty line, generating a writeback request. If the memory subsystem is @@ -632,9 +730,16 @@ class cache_config { "Invalid cache configuration: FETCH_ON_WRITE and LAZY_FETCH_ON_READ " "cannot work properly with ON_FILL policy. Cache must be ON_MISS. "); } + if (m_cache_type == SECTOR) { - assert(m_line_sz / SECTOR_SIZE == SECTOR_CHUNCK_SIZE && - m_line_sz % SECTOR_SIZE == 0); + bool cond = m_line_sz / SECTOR_SIZE == SECTOR_CHUNCK_SIZE && + m_line_sz % SECTOR_SIZE == 0; + if (!cond) { + std::cerr << "error: For sector cache, the simulator uses hard-coded " + "SECTOR_SIZE and SECTOR_CHUNCK_SIZE. 
The line size " + "must be product of both values.\n"; + assert(0); + } } // default: port to data array width and granularity = line size @@ -656,6 +761,9 @@ class cache_config { case 'L': m_set_index_function = LINEAR_SET_FUNCTION; break; + case 'X': + m_set_index_function = BITWISE_XORING_FUNCTION; + break; default: exit_parse_error(); } @@ -675,11 +783,11 @@ class cache_config { } unsigned get_max_num_lines() const { assert(m_valid); - return MAX_DEFAULT_CACHE_SIZE_MULTIBLIER * m_nset * original_m_assoc; + return get_max_cache_multiplier() * m_nset * original_m_assoc; } unsigned get_max_assoc() const { assert(m_valid); - return MAX_DEFAULT_CACHE_SIZE_MULTIBLIER * original_m_assoc; + return get_max_cache_multiplier() * original_m_assoc; } void print(FILE *fp) const { fprintf(fp, "Size = %d B (%d Set x %d-way x %d byte line)\n", @@ -688,6 +796,10 @@ class cache_config { virtual unsigned set_index(new_addr_type addr) const; + virtual unsigned get_max_cache_multiplier() const { + return MAX_DEFAULT_CACHE_SIZE_MULTIBLIER; + } + unsigned hash_function(new_addr_type addr, unsigned m_nset, unsigned m_line_sz_log2, unsigned m_nset_log2, unsigned m_index_function) const; @@ -722,10 +834,18 @@ class cache_config { } bool is_streaming() { return m_is_streaming; } FuncCache get_cache_status() { return cache_status; } + void set_allocation_policy(enum allocation_policy_t alloc) { + m_alloc_policy = alloc; + } char *m_config_string; char *m_config_stringPrefL1; char *m_config_stringPrefShared; FuncCache cache_status; + unsigned m_wr_percent; + write_allocate_policy_t get_write_allocate_policy() { + return m_write_alloc_policy; + } + write_policy_t get_write_policy() { return m_write_policy; } protected: void exit_parse_error() { @@ -789,16 +909,28 @@ class l1d_cache_config : public cache_config { l1d_cache_config() : cache_config() {} unsigned set_bank(new_addr_type addr) const; void init(char *config, FuncCache status) { - m_banks_byte_interleaving_log2 = LOGB2(l1_banks_byte_interleaving); - m_l1_banks_log2 = LOGB2(l1_banks); + l1_banks_byte_interleaving_log2 = LOGB2(l1_banks_byte_interleaving); + l1_banks_log2 = LOGB2(l1_banks); cache_config::init(config, status); } unsigned l1_latency; unsigned l1_banks; - unsigned m_l1_banks_log2; + unsigned l1_banks_log2; unsigned l1_banks_byte_interleaving; - unsigned m_banks_byte_interleaving_log2; + unsigned l1_banks_byte_interleaving_log2; unsigned l1_banks_hashing_function; + unsigned m_unified_cache_size; + virtual unsigned get_max_cache_multiplier() const { + // set * assoc * cacheline size. 
Then convert Byte to KB + // gpgpu_unified_cache_size is in KB while original_sz is in B + if (m_unified_cache_size > 0) { + unsigned original_size = m_nset * original_m_assoc * m_line_sz / 1024; + assert(m_unified_cache_size % original_size == 0); + return m_unified_cache_size / original_size; + } else { + return MAX_DEFAULT_CACHE_SIZE_MULTIBLIER; + } + } }; class l2_cache_config : public cache_config { @@ -818,9 +950,10 @@ class tag_array { ~tag_array(); enum cache_request_status probe(new_addr_type addr, unsigned &idx, - mem_fetch *mf, bool probe_mode = false) const; + mem_fetch *mf, bool is_write, + bool probe_mode = false) const; enum cache_request_status probe(new_addr_type addr, unsigned &idx, - mem_access_sector_mask_t mask, + mem_access_sector_mask_t mask, bool is_write, bool probe_mode = false, mem_fetch *mf = NULL) const; enum cache_request_status access(new_addr_type addr, unsigned time, @@ -829,9 +962,10 @@ class tag_array { unsigned &idx, bool &wb, evicted_block_info &evicted, mem_fetch *mf); - void fill(new_addr_type addr, unsigned time, mem_fetch *mf); + void fill(new_addr_type addr, unsigned time, mem_fetch *mf, bool is_write); void fill(unsigned idx, unsigned time, mem_fetch *mf); - void fill(new_addr_type addr, unsigned time, mem_access_sector_mask_t mask); + void fill(new_addr_type addr, unsigned time, mem_access_sector_mask_t mask, + mem_access_byte_mask_t byte_mask, bool is_write); unsigned size() const { return m_config.get_num_lines(); } cache_block_t *get_block(unsigned idx) { return m_lines[idx]; } @@ -849,6 +983,7 @@ class tag_array { void update_cache_parameters(cache_config &config); void add_pending_line(mem_fetch *mf); void remove_pending_line(mem_fetch *mf); + void inc_dirty() { m_dirty++; } protected: // This constructor is intended for use only from derived classes that wish to @@ -869,6 +1004,7 @@ class tag_array { // allocated but not filled unsigned m_res_fail; unsigned m_sector_miss; + unsigned m_dirty; // performance counters for calculating the amount of misses within a time // window @@ -1071,20 +1207,26 @@ class cache_stats { void clear(); // Clear AerialVision cache stats after each window void clear_pw(); - void inc_stats(int access_type, int access_outcome); + void inc_stats(int access_type, int access_outcome, + unsigned long long streamID); // Increment AerialVision cache stats - void inc_stats_pw(int access_type, int access_outcome); - void inc_fail_stats(int access_type, int fail_outcome); + void inc_stats_pw(int access_type, int access_outcome, + unsigned long long streamID); + void inc_fail_stats(int access_type, int fail_outcome, + unsigned long long streamID); enum cache_request_status select_stats_status( enum cache_request_status probe, enum cache_request_status access) const; unsigned long long &operator()(int access_type, int access_outcome, - bool fail_outcome); + bool fail_outcome, + unsigned long long streamID); unsigned long long operator()(int access_type, int access_outcome, - bool fail_outcome) const; + bool fail_outcome, + unsigned long long streamID) const; cache_stats operator+(const cache_stats &cs); cache_stats &operator+=(const cache_stats &cs); - void print_stats(FILE *fout, const char *cache_name = "Cache_stats") const; - void print_fail_stats(FILE *fout, + void print_stats(FILE *fout, unsigned long long streamID, + const char *cache_name = "Cache_stats") const; + void print_fail_stats(FILE *fout, unsigned long long streamID, const char *cache_name = "Cache_fail_stats") const; unsigned long long get_stats(enum 
mem_access_type *access_type,
@@ -1102,10 +1244,14 @@ class cache_stats {
   bool check_valid(int type, int status) const;
   bool check_fail_valid(int type, int fail) const;

-  std::vector<std::vector<unsigned long long> > m_stats;
+  // CUDA streamID -> cache stats[NUM_MEM_ACCESS_TYPE]
+  std::map<unsigned long long, std::vector<std::vector<unsigned long long>>>
+      m_stats;
   // AerialVision cache stats (per-window)
-  std::vector<std::vector<unsigned long long> > m_stats_pw;
-  std::vector<std::vector<unsigned long long> > m_fail_stats;
+  std::map<unsigned long long, std::vector<std::vector<unsigned long long>>>
+      m_stats_pw;
+  std::map<unsigned long long, std::vector<std::vector<unsigned long long>>>
+      m_fail_stats;

   unsigned long long m_cache_port_available_cycles;
   unsigned long long m_cache_data_port_busy_cycles;
@@ -1135,11 +1281,14 @@ class baseline_cache : public cache_t {
  public:
   baseline_cache(const char *name, cache_config &config, int core_id,
                  int type_id, mem_fetch_interface *memport,
-                 enum mem_fetch_status status)
+                 enum mem_fetch_status status, enum cache_gpu_level level,
+                 gpgpu_sim *gpu)
       : m_config(config),
         m_tag_array(new tag_array(config, core_id, type_id)),
         m_mshrs(config.m_mshr_entries, config.m_mshr_max_merge),
-        m_bandwidth_management(config) {
+        m_bandwidth_management(config),
+        m_level(level),
+        m_gpu(gpu) {
     init(name, config, memport, status);
   }
@@ -1207,6 +1356,15 @@ class baseline_cache : public cache_t {
   bool fill_port_free() const {
     return m_bandwidth_management.fill_port_free();
   }
+  void inc_aggregated_stats(cache_request_status status,
+                            cache_request_status cache_status, mem_fetch *mf,
+                            enum cache_gpu_level level);
+  void inc_aggregated_fail_stats(cache_request_status status,
+                                 cache_request_status cache_status,
+                                 mem_fetch *mf, enum cache_gpu_level level);
+  void inc_aggregated_stats_pw(cache_request_status status,
+                               cache_request_status cache_status, mem_fetch *mf,
+                               enum cache_gpu_level level);
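
To make the new bookkeeping concrete: every counter access above is now a map-then-index operation keyed by the CUDA stream, and aggregate printouts must sum over streams instead of reading one vector. A minimal, self-contained sketch of that access pattern (the type alias and helper name are illustrative, not part of this patch):

#include <map>
#include <vector>

// streamID -> [access_type][outcome] counters, grown lazily per stream
using counter_table = std::vector<std::vector<unsigned long long>>;

struct per_stream_stats_sketch {
  std::map<unsigned long long, counter_table> m_stats;

  void inc(unsigned long long stream_id, unsigned access_type,
           unsigned outcome, unsigned n_types, unsigned n_outcomes) {
    counter_table &t = m_stats[stream_id];  // inserts an empty table on first use
    if (t.empty())
      t.assign(n_types, std::vector<unsigned long long>(n_outcomes, 0));
    ++t[access_type][outcome];
  }
};

Summing a statistic across all streams is then a loop over m_stats rather than a single vector read.
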
  // This is a gaping hole we are poking in the system to quickly handle
  // filling the cache on cudamemcopies. We don't care about anything other
  // than whether something is read or written, without doing anything else.
  void force_tag_access(new_addr_type addr, unsigned time,
                        mem_access_sector_mask_t mask) {
-    m_tag_array->fill(addr, time, mask);
+    mem_access_byte_mask_t byte_mask;
+    m_tag_array->fill(addr, time, mask, byte_mask, true);
   }

  protected:
@@ -1237,6 +1396,8 @@ class baseline_cache : public cache_t {
   std::list<mem_fetch *> m_miss_queue;
   enum mem_fetch_status m_miss_queue_status;
   mem_fetch_interface *m_memport;
+  cache_gpu_level m_level;
+  gpgpu_sim *m_gpu;

   struct extra_mf_fields {
     extra_mf_fields() { m_valid = false; }
@@ -1323,8 +1484,10 @@ class read_only_cache : public baseline_cache {
  public:
   read_only_cache(const char *name, cache_config &config, int core_id,
                   int type_id, mem_fetch_interface *memport,
-                  enum mem_fetch_status status)
-      : baseline_cache(name, config, core_id, type_id, memport, status) {}
+                  enum mem_fetch_status status, enum cache_gpu_level level,
+                  gpgpu_sim *gpu)
+      : baseline_cache(name, config, core_id, type_id, memport, status, level,
+                       gpu) {}

   /// Access cache for read_only_cache: returns RESERVATION_FAIL if request
   /// could not be accepted (for any reason)
@@ -1348,8 +1511,10 @@ class data_cache : public baseline_cache {
   data_cache(const char *name, cache_config &config, int core_id, int type_id,
              mem_fetch_interface *memport, mem_fetch_allocator *mfcreator,
              enum mem_fetch_status status, mem_access_type wr_alloc_type,
-             mem_access_type wrbk_type, class gpgpu_sim *gpu)
-      : baseline_cache(name, config, core_id, type_id, memport, status) {
+             mem_access_type wrbk_type, class gpgpu_sim *gpu,
+             enum cache_gpu_level level)
+      : baseline_cache(name, config, core_id, type_id, memport, status, level,
+                       gpu) {
     init(mfcreator);
     m_wr_alloc_type = wr_alloc_type;
     m_wrbk_type = wrbk_type;
@@ -1451,7 +1616,7 @@ class data_cache : public baseline_cache {
   /// Sends write request to lower level memory (write or writeback)
   void send_write_request(mem_fetch *mf, cache_event request, unsigned time,
                           std::list<cache_event> &events);
-
+  void update_m_readable(mem_fetch *mf, unsigned cache_index);
   // Member Function pointers - Set by configuration options
   // to the functions below each grouping
   /******* Write-hit configs *******/
@@ -1538,9 +1703,10 @@ class l1_cache : public data_cache {
  public:
   l1_cache(const char *name, cache_config &config, int core_id, int type_id,
            mem_fetch_interface *memport, mem_fetch_allocator *mfcreator,
-           enum mem_fetch_status status, class gpgpu_sim *gpu)
+           enum mem_fetch_status status, class gpgpu_sim *gpu,
+           enum cache_gpu_level level)
       : data_cache(name, config, core_id, type_id, memport, mfcreator, status,
-                   L1_WR_ALLOC_R, L1_WRBK_ACC, gpu) {}
+                   L1_WR_ALLOC_R, L1_WRBK_ACC, gpu, level) {}

   virtual ~l1_cache() {}
@@ -1563,9 +1729,10 @@ class l2_cache : public data_cache {
  public:
   l2_cache(const char *name, cache_config &config, int core_id, int type_id,
            mem_fetch_interface *memport, mem_fetch_allocator *mfcreator,
-           enum mem_fetch_status status, class gpgpu_sim *gpu)
+           enum mem_fetch_status status, class gpgpu_sim *gpu,
+           enum cache_gpu_level level)
       : data_cache(name, config, core_id, type_id, memport, mfcreator, status,
-                   L2_WR_ALLOC_R, L2_WRBK_ACC, gpu) {}
+                   L2_WR_ALLOC_R, L2_WRBK_ACC, gpu, level) {}

   virtual ~l2_cache() {}

diff --git a/src/gpgpu-sim/gpu-sim.cc b/src/gpgpu-sim/gpu-sim.cc
index 165068879..b92494b43 100644
--- a/src/gpgpu-sim/gpu-sim.cc
+++ b/src/gpgpu-sim/gpu-sim.cc
@@ -1,19 +1,22 @@
-// Copyright (c) 2009-2011, Tor M. Aamodt, Wilson W.L. Fung, George L. Yuan,
-// Ali Bakhoda, Andrew Turner, Ivan Sham
-// The University of British Columbia
-// All rights reserved. 
+// Copyright (c) 2009-2021, Tor M. Aamodt, Wilson W.L. Fung, George L. Yuan, +// Ali Bakhoda, Andrew Turner, Ivan Sham, Vijay Kandiah, Nikos Hardavellas, +// Mahmoud Khairy, Junrui Pan, Timothy G. Rogers +// The University of British Columbia, Northwestern University, Purdue +// University All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, +// this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -78,7 +81,7 @@ class gpgpu_sim_wrapper {}; #include #include -#define MAX(a, b) (((a) > (b)) ? (a) : (b)) +// #define MAX(a, b) (((a) > (b)) ? 
(a) : (b)) // redefined

 bool g_interactive_debugger_enabled = false;

@@ -96,9 +99,9 @@ tr1_hash_map<new_addr_type, unsigned> address_random_interleaving;
 #include "mem_latency_stat.h"

 void power_config::reg_options(class OptionParser *opp) {
-  option_parser_register(opp, "-gpuwattch_xml_file", OPT_CSTR,
-                         &g_power_config_name, "GPUWattch XML file",
-                         "gpuwattch.xml");
+  option_parser_register(opp, "-accelwattch_xml_file", OPT_CSTR,
+                         &g_power_config_name, "AccelWattch XML file",
+                         "accelwattch_sass_sim.xml");

   option_parser_register(opp, "-power_simulation_enabled", OPT_BOOL,
                          &g_power_simulation_enabled,
@@ -108,6 +111,107 @@ void power_config::reg_options(class OptionParser *opp) {
                          &g_power_per_cycle_dump,
                          "Dump detailed power output each cycle", "0");

+  option_parser_register(opp, "-hw_perf_file_name", OPT_CSTR,
+                         &g_hw_perf_file_name,
+                         "Hardware Performance Statistics file", "hw_perf.csv");
+
+  option_parser_register(
+      opp, "-hw_perf_bench_name", OPT_CSTR, &g_hw_perf_bench_name,
+      "Kernel Name in Hardware Performance Statistics file", "");
+
+  option_parser_register(opp, "-power_simulation_mode", OPT_INT32,
+                         &g_power_simulation_mode,
+                         "Switch performance counter input for power "
+                         "simulation (0=Sim, 1=HW, 2=HW-Sim Hybrid)",
+                         "0");
+
+  option_parser_register(opp, "-dvfs_enabled", OPT_BOOL, &g_dvfs_enabled,
+                         "Turn on DVFS for power model", "0");
+  option_parser_register(opp, "-aggregate_power_stats", OPT_BOOL,
+                         &g_aggregate_power_stats,
+                         "Accumulate power across all kernels", "0");
+
+  // AccelWattch Hybrid Configuration
+
+  option_parser_register(
+      opp, "-accelwattch_hybrid_perfsim_L1_RH", OPT_BOOL,
+      &accelwattch_hybrid_configuration[HW_L1_RH],
+      "Get L1 Read Hits for Accelwattch-Hybrid from Accel-Sim", "0");
+  option_parser_register(
+      opp, "-accelwattch_hybrid_perfsim_L1_RM", OPT_BOOL,
+      &accelwattch_hybrid_configuration[HW_L1_RM],
+      "Get L1 Read Misses for Accelwattch-Hybrid from Accel-Sim", "0");
+  option_parser_register(
+      opp, "-accelwattch_hybrid_perfsim_L1_WH", OPT_BOOL,
+      &accelwattch_hybrid_configuration[HW_L1_WH],
+      "Get L1 Write Hits for Accelwattch-Hybrid from Accel-Sim", "0");
+  option_parser_register(
+      opp, "-accelwattch_hybrid_perfsim_L1_WM", OPT_BOOL,
+      &accelwattch_hybrid_configuration[HW_L1_WM],
+      "Get L1 Write Misses for Accelwattch-Hybrid from Accel-Sim", "0");
+
+  option_parser_register(
+      opp, "-accelwattch_hybrid_perfsim_L2_RH", OPT_BOOL,
+      &accelwattch_hybrid_configuration[HW_L2_RH],
+      "Get L2 Read Hits for Accelwattch-Hybrid from Accel-Sim", "0");
+  option_parser_register(
+      opp, "-accelwattch_hybrid_perfsim_L2_RM", OPT_BOOL,
+      &accelwattch_hybrid_configuration[HW_L2_RM],
+      "Get L2 Read Misses for Accelwattch-Hybrid from Accel-Sim", "0");
+  option_parser_register(
+      opp, "-accelwattch_hybrid_perfsim_L2_WH", OPT_BOOL,
+      &accelwattch_hybrid_configuration[HW_L2_WH],
+      "Get L2 Write Hits for Accelwattch-Hybrid from Accel-Sim", "0");
+  option_parser_register(
+      opp, "-accelwattch_hybrid_perfsim_L2_WM", OPT_BOOL,
+      &accelwattch_hybrid_configuration[HW_L2_WM],
+      "Get L2 Write Misses for Accelwattch-Hybrid from Accel-Sim", "0");
+
+  option_parser_register(
+      opp, "-accelwattch_hybrid_perfsim_CC_ACC", OPT_BOOL,
+      &accelwattch_hybrid_configuration[HW_CC_ACC],
+      "Get Constant Cache Accesses for Accelwattch-Hybrid from Accel-Sim", "0");
+
+  option_parser_register(
+      opp, "-accelwattch_hybrid_perfsim_SHARED_ACC", OPT_BOOL,
+      &accelwattch_hybrid_configuration[HW_SHRD_ACC],
+      "Get Shared Memory Accesses for Accelwattch-Hybrid from Accel-Sim", "0");
+
+  option_parser_register(opp, "-accelwattch_hybrid_perfsim_DRAM_RD", OPT_BOOL,
+                         &accelwattch_hybrid_configuration[HW_DRAM_RD],
+                         "Get DRAM Reads for Accelwattch-Hybrid from Accel-Sim",
+                         "0");
+  option_parser_register(
+      opp, "-accelwattch_hybrid_perfsim_DRAM_WR", OPT_BOOL,
+      &accelwattch_hybrid_configuration[HW_DRAM_WR],
+      "Get DRAM Writes for Accelwattch-Hybrid from Accel-Sim", "0");
+
+  option_parser_register(
+      opp, "-accelwattch_hybrid_perfsim_NOC", OPT_BOOL,
+      &accelwattch_hybrid_configuration[HW_NOC],
+      "Get Interconnect Accesses for Accelwattch-Hybrid from Accel-Sim", "0");
+
+  option_parser_register(
+      opp, "-accelwattch_hybrid_perfsim_PIPE_DUTY", OPT_BOOL,
+      &accelwattch_hybrid_configuration[HW_PIPE_DUTY],
+      "Get Pipeline Duty Cycle Accesses for Accelwattch-Hybrid from Accel-Sim",
+      "0");
+
+  option_parser_register(
+      opp, "-accelwattch_hybrid_perfsim_NUM_SM_IDLE", OPT_BOOL,
+      &accelwattch_hybrid_configuration[HW_NUM_SM_IDLE],
+      "Get Number of Idle SMs for Accelwattch-Hybrid from Accel-Sim", "0");
+
+  option_parser_register(
+      opp, "-accelwattch_hybrid_perfsim_CYCLES", OPT_BOOL,
+      &accelwattch_hybrid_configuration[HW_CYCLES],
+      "Get Executed Cycles for Accelwattch-Hybrid from Accel-Sim", "0");
+
+  option_parser_register(
+      opp, "-accelwattch_hybrid_perfsim_VOLTAGE", OPT_BOOL,
+      &accelwattch_hybrid_configuration[HW_VOLTAGE],
+      "Get Chip Voltage for Accelwattch-Hybrid from Accel-Sim", "0");
+
   // Output Data Formats
   option_parser_register(
       opp, "-power_trace_enabled", OPT_BOOL, &g_power_trace_enabled,
@@ -215,6 +319,9 @@ void memory_config::reg_options(class OptionParser *opp) {
                          "elimnate_rw_turnaround i.e set tWTR and tRTW = 0",
                          "0");
   option_parser_register(opp, "-icnt_flit_size", OPT_UINT32, &icnt_flit_size,
                          "icnt_flit_size", "32");
+  // SST mode activation
+  option_parser_register(opp, "-SST_mode", OPT_BOOL, &SST_mode, "SST mode",
+                         "0");
   m_address_mapping.addrdec_setoption(opp);
 }

@@ -249,6 +356,8 @@ void shader_core_config::reg_options(class OptionParser *opp) {
      " {<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>,<mshr>:<N>:<merge>,<mq> | none}",
      "none");

+  option_parser_register(opp, "-gpgpu_l1_cache_write_ratio", OPT_UINT32,
+                         &m_L1D_config.m_wr_percent, "L1D write ratio", "0");
   option_parser_register(opp, "-gpgpu_l1_banks", OPT_UINT32,
                          &m_L1D_config.l1_banks, "The number of L1 cache banks",
                          "1");
@@ -304,7 +413,7 @@ void shader_core_config::reg_options(class OptionParser *opp) {
       "gpgpu_ignore_resources_limitation (default 0)", "0");
   option_parser_register(
       opp, "-gpgpu_shader_cta", OPT_UINT32, &max_cta_per_core,
-      "Maximum number of concurrent CTAs in shader (default 8)", "8");
+      "Maximum number of concurrent CTAs in shader (default 32)", "32");
   option_parser_register(
       opp, "-gpgpu_num_cta_barriers", OPT_UINT32, &max_barriers_per_cta,
       "Maximum number of named barriers per CTA (default 16)", "16");
@@ -326,7 +435,14 @@ void shader_core_config::reg_options(class OptionParser *opp) {
   option_parser_register(
       opp, "-gpgpu_shmem_size", OPT_UINT32, &gpgpu_shmem_size,
       "Size of shared memory per shader core (default 16kB)", "16384");
-  option_parser_register(opp, "-gpgpu_adaptive_cache_config", OPT_UINT32,
+  option_parser_register(opp, "-gpgpu_shmem_option", OPT_CSTR,
+                         &gpgpu_shmem_option,
+                         "Option list of shared memory sizes", "0");
+  option_parser_register(
+      opp, "-gpgpu_unified_l1d_size", OPT_UINT32,
+      &m_L1D_config.m_unified_cache_size,
+      "Size of unified data cache (L1D + shared memory) in KB", "0");
+  option_parser_register(opp, "-gpgpu_adaptive_cache_config", OPT_BOOL,
                          &adaptive_cache_config, "adaptive_cache_config", "0");
   option_parser_register(
       opp, 
"-gpgpu_shmem_sizeDefault", OPT_UINT32, &gpgpu_shmem_sizeDefault, @@ -488,26 +604,26 @@ void shader_core_config::reg_options(class OptionParser *opp) { "ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_" "INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE", "1,1,1,1,1,1,1,1,1,1,1,1,1"); - option_parser_register(opp, "-gpgpu_tensor_core_avail", OPT_INT32, + option_parser_register(opp, "-gpgpu_tensor_core_avail", OPT_UINT32, &gpgpu_tensor_core_avail, "Tensor Core Available (default=0)", "0"); - option_parser_register(opp, "-gpgpu_num_sp_units", OPT_INT32, + option_parser_register(opp, "-gpgpu_num_sp_units", OPT_UINT32, &gpgpu_num_sp_units, "Number of SP units (default=1)", "1"); - option_parser_register(opp, "-gpgpu_num_dp_units", OPT_INT32, + option_parser_register(opp, "-gpgpu_num_dp_units", OPT_UINT32, &gpgpu_num_dp_units, "Number of DP units (default=0)", "0"); - option_parser_register(opp, "-gpgpu_num_int_units", OPT_INT32, + option_parser_register(opp, "-gpgpu_num_int_units", OPT_UINT32, &gpgpu_num_int_units, "Number of INT units (default=0)", "0"); - option_parser_register(opp, "-gpgpu_num_sfu_units", OPT_INT32, + option_parser_register(opp, "-gpgpu_num_sfu_units", OPT_UINT32, &gpgpu_num_sfu_units, "Number of SF units (default=1)", "1"); - option_parser_register(opp, "-gpgpu_num_tensor_core_units", OPT_INT32, + option_parser_register(opp, "-gpgpu_num_tensor_core_units", OPT_UINT32, &gpgpu_num_tensor_core_units, "Number of tensor_core units (default=1)", "0"); option_parser_register( - opp, "-gpgpu_num_mem_units", OPT_INT32, &gpgpu_num_mem_units, + opp, "-gpgpu_num_mem_units", OPT_UINT32, &gpgpu_num_mem_units, "Number if ldst units (default=1) WARNING: not hooked up to anything", "1"); option_parser_register( @@ -603,7 +719,9 @@ void gpgpu_sim_config::reg_options(option_parser_t opp) { "500.0:2000.0:2000.0:2000.0"); option_parser_register( opp, "-gpgpu_max_concurrent_kernel", OPT_INT32, &max_concurrent_kernel, - "maximum kernels that can run concurrently on GPU", "8"); + "maximum kernels that can run concurrently on GPU, set this value " + "according to max resident grids for your compute capability", + "32"); option_parser_register( opp, "-gpgpu_cflog_interval", OPT_INT32, &gpgpu_cflog_interval, "Interval between each snapshot in control flow logger", "0"); @@ -673,6 +791,22 @@ void increment_x_then_y_then_z(dim3 &i, const dim3 &bound) { } void gpgpu_sim::launch(kernel_info_t *kinfo) { + unsigned kernelID = kinfo->get_uid(); + unsigned long long streamID = kinfo->get_streamID(); + + kernel_time_t kernel_time = {gpu_tot_sim_cycle + gpu_sim_cycle, 0}; + if (gpu_kernel_time.find(streamID) == gpu_kernel_time.end()) { + std::map new_val; + new_val.insert(std::pair(kernelID, kernel_time)); + gpu_kernel_time.insert( + std::pair>( + streamID, new_val)); + } else { + gpu_kernel_time.at(streamID).insert( + std::pair(kernelID, kernel_time)); + ////////// assume same kernel ID do not appear more than once + } + unsigned cta_size = kinfo->threads_per_cta(); if (cta_size > m_shader_config->n_thread_per_shader) { printf( @@ -778,7 +912,10 @@ kernel_info_t *gpgpu_sim::select_kernel() { } unsigned gpgpu_sim::finished_kernel() { - if (m_finished_kernel.empty()) return 0; + if (m_finished_kernel.empty()) { + last_streamID = -1; + return 0; + } unsigned result = m_finished_kernel.front(); m_finished_kernel.pop_front(); return result; @@ -786,6 +923,11 @@ unsigned gpgpu_sim::finished_kernel() { void gpgpu_sim::set_kernel_done(kernel_info_t *kernel) { unsigned uid = 
kernel->get_uid(); + last_uid = uid; + unsigned long long streamID = kernel->get_streamID(); + last_streamID = streamID; + gpu_kernel_time.at(streamID).at(uid).end_cycle = + gpu_tot_sim_cycle + gpu_sim_cycle; m_finished_kernel.push_back(uid); std::vector::iterator k; for (k = m_running_kernels.begin(); k != m_running_kernels.end(); k++) { @@ -816,6 +958,16 @@ void exec_gpgpu_sim::createSIMTCluster() { m_shader_stats, m_memory_stats); } +// SST get its own simt_cluster +void sst_gpgpu_sim::createSIMTCluster() { + m_cluster = new simt_core_cluster *[m_shader_config->n_simt_clusters]; + for (unsigned i = 0; i < m_shader_config->n_simt_clusters; i++) + m_cluster[i] = + new sst_simt_core_cluster(this, i, m_shader_config, m_memory_config, + m_shader_stats, m_memory_stats); + SST_gpgpu_reply_buffer.resize(m_shader_config->n_simt_clusters); +} + gpgpu_sim::gpgpu_sim(const gpgpu_sim_config &config, gpgpu_context *ctx) : gpgpu_t(config, ctx), m_config(config) { gpgpu_ctx = ctx; @@ -825,8 +977,9 @@ gpgpu_sim::gpgpu_sim(const gpgpu_sim_config &config, gpgpu_context *ctx) ptx_file_line_stats_create_exposed_latency_tracker(m_config.num_shader()); #ifdef GPGPUSIM_POWER_MODEL - m_gpgpusim_wrapper = new gpgpu_sim_wrapper(config.g_power_simulation_enabled, - config.g_power_config_name); + m_gpgpusim_wrapper = new gpgpu_sim_wrapper( + config.g_power_simulation_enabled, config.g_power_config_name, + config.g_power_simulation_mode, config.g_dvfs_enabled); #endif m_shader_stats = new shader_core_stats(m_shader_config); @@ -855,27 +1008,33 @@ gpgpu_sim::gpgpu_sim(const gpgpu_sim_config &config, gpgpu_context *ctx) gpu_tot_sim_cycle_parition_util = 0; partiton_replys_in_parallel = 0; partiton_replys_in_parallel_total = 0; + last_streamID = -1; - m_memory_partition_unit = - new memory_partition_unit *[m_memory_config->m_n_mem]; - m_memory_sub_partition = - new memory_sub_partition *[m_memory_config->m_n_mem_sub_partition]; - for (unsigned i = 0; i < m_memory_config->m_n_mem; i++) { - m_memory_partition_unit[i] = - new memory_partition_unit(i, m_memory_config, m_memory_stats, this); - for (unsigned p = 0; - p < m_memory_config->m_n_sub_partition_per_memory_channel; p++) { - unsigned submpid = - i * m_memory_config->m_n_sub_partition_per_memory_channel + p; - m_memory_sub_partition[submpid] = - m_memory_partition_unit[i]->get_sub_partition(p); - } - } + gpu_kernel_time.clear(); - icnt_wrapper_init(); - icnt_create(m_shader_config->n_simt_clusters, - m_memory_config->m_n_mem_sub_partition); + // TODO: somehow move this logic to the sst_gpgpu_sim constructor? 
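
The conditional that follows builds the memory partitions and the interconnect only outside SST mode, since SST's Balar component owns those in SST runs. A hypothetical top-level dispatch illustrating the intended split (the factory function below is not part of this patch; both constructors it calls do appear in it):

// Hypothetical helper: choose the simulator flavor from the -SST_mode flag.
// Assumes the constructor signatures shown elsewhere in this patch.
gpgpu_sim *make_sim(const gpgpu_sim_config &config, gpgpu_context *ctx) {
  if (config.is_SST_mode())
    return new sst_gpgpu_sim(config, ctx);  // memory/icnt handled by SST
  return new exec_gpgpu_sim(config, ctx);   // builds its own partitions and icnt
}
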
+ if (!m_config.is_SST_mode()) { + // Init memory if not in SST mode + m_memory_partition_unit = + new memory_partition_unit *[m_memory_config->m_n_mem]; + m_memory_sub_partition = + new memory_sub_partition *[m_memory_config->m_n_mem_sub_partition]; + for (unsigned i = 0; i < m_memory_config->m_n_mem; i++) { + m_memory_partition_unit[i] = + new memory_partition_unit(i, m_memory_config, m_memory_stats, this); + for (unsigned p = 0; + p < m_memory_config->m_n_sub_partition_per_memory_channel; p++) { + unsigned submpid = + i * m_memory_config->m_n_sub_partition_per_memory_channel + p; + m_memory_sub_partition[submpid] = + m_memory_partition_unit[i]->get_sub_partition(p); + } + } + icnt_wrapper_init(); + icnt_create(m_shader_config->n_simt_clusters, + m_memory_config->m_n_mem_sub_partition); + } time_vector_create(NUM_MEM_REQ_STAT); fprintf(stdout, "GPGPU-Sim uArch: performance model initialization complete.\n"); @@ -894,6 +1053,22 @@ gpgpu_sim::gpgpu_sim(const gpgpu_sim_config &config, gpgpu_context *ctx) m_functional_sim_kernel = NULL; } +void sst_gpgpu_sim::SST_receive_mem_reply(unsigned core_id, void *mem_req) { + assert(core_id < m_shader_config->n_simt_clusters); + mem_fetch *mf = (mem_fetch *)mem_req; + + (SST_gpgpu_reply_buffer[core_id]).push_back(mf); +} + +mem_fetch *sst_gpgpu_sim::SST_pop_mem_reply(unsigned core_id) { + if (SST_gpgpu_reply_buffer[core_id].size() > 0) { + mem_fetch *temp = SST_gpgpu_reply_buffer[core_id].front(); + SST_gpgpu_reply_buffer[core_id].pop_front(); + return temp; + } else + return NULL; +} + int gpgpu_sim::shared_mem_size() const { return m_shader_config->gpgpu_shmem_size; } @@ -989,6 +1164,26 @@ bool gpgpu_sim::active() { return false; } +bool sst_gpgpu_sim::active() { + if (m_config.gpu_max_cycle_opt && + (gpu_tot_sim_cycle + gpu_sim_cycle) >= m_config.gpu_max_cycle_opt) + return false; + if (m_config.gpu_max_insn_opt && + (gpu_tot_sim_insn + gpu_sim_insn) >= m_config.gpu_max_insn_opt) + return false; + if (m_config.gpu_max_cta_opt && + (gpu_tot_issued_cta >= m_config.gpu_max_cta_opt)) + return false; + if (m_config.gpu_max_completed_cta_opt && + (gpu_completed_cta >= m_config.gpu_max_completed_cta_opt)) + return false; + if (m_config.gpu_deadlock_detect && gpu_deadlock) return false; + for (unsigned i = 0; i < m_shader_config->n_simt_clusters; i++) + if (m_cluster[i]->get_not_completed() > 0) return true; + if (get_more_cta_left()) return true; + return false; +} + void gpgpu_sim::init() { // run a CUDA grid on the GPU microarchitecture simulator gpu_sim_cycle = 0; @@ -1001,6 +1196,14 @@ void gpgpu_sim::init() { partiton_reqs_in_parallel_util = 0; gpu_sim_cycle_parition_util = 0; +// McPAT initialization function. Called on first launch of GPU +#ifdef GPGPUSIM_POWER_MODEL + if (m_config.g_power_simulation_enabled) { + init_mcpat(m_config, m_gpgpusim_wrapper, m_config.gpu_stat_sample_freq, + gpu_tot_sim_insn, gpu_sim_insn); + } +#endif + reinit_clock_domains(); gpgpu_ctx->func_sim->set_param_gpgpu_num_shaders(m_config.num_shader()); for (unsigned i = 0; i < m_shader_config->n_simt_clusters; i++) @@ -1026,14 +1229,6 @@ void gpgpu_sim::init() { } if (g_network_mode) icnt_init(); - - // McPAT initialization function. 
Called on first launch of GPU -#ifdef GPGPUSIM_POWER_MODEL - if (m_config.g_power_simulation_enabled) { - init_mcpat(m_config, m_gpgpusim_wrapper, m_config.gpu_stat_sample_freq, - gpu_tot_sim_insn, gpu_sim_insn); - } -#endif } void gpgpu_sim::update_stats() { @@ -1058,9 +1253,13 @@ void gpgpu_sim::update_stats() { gpu_occupancy = occupancy_stats(); } -void gpgpu_sim::print_stats() { +PowerscalingCoefficients *gpgpu_sim::get_scaling_coeffs() { + return m_gpgpusim_wrapper->get_scaling_coeffs(); +} + +void gpgpu_sim::print_stats(unsigned long long streamID) { gpgpu_ctx->stats->ptx_file_line_stats_write_file(); - gpu_print_stat(); + gpu_print_stat(streamID); if (g_network_mode) { printf( @@ -1137,6 +1336,18 @@ std::string gpgpu_sim::executed_kernel_info_string() { return statout.str(); } + +std::string gpgpu_sim::executed_kernel_name() { + std::stringstream statout; + if (m_executed_kernel_names.size() == 1) + statout << m_executed_kernel_names[0]; + else { + for (unsigned int k = 0; k < m_executed_kernel_names.size(); k++) { + statout << m_executed_kernel_names[k] << " "; + } + } + return statout.str(); +} void gpgpu_sim::set_cache_config(std::string kernel_name, FuncCache cacheConfig) { m_special_cache_config[kernel_name] = cacheConfig; @@ -1231,12 +1442,15 @@ void gpgpu_sim::clear_executed_kernel_info() { m_executed_kernel_names.clear(); m_executed_kernel_uids.clear(); } -void gpgpu_sim::gpu_print_stat() { + +void gpgpu_sim::gpu_print_stat(unsigned long long streamID) { FILE *statfout = stdout; std::string kernel_info_str = executed_kernel_info_string(); fprintf(statfout, "%s", kernel_info_str.c_str()); + printf("kernel_stream_id = %llu\n", streamID); + printf("gpu_sim_cycle = %lld\n", gpu_sim_cycle); printf("gpu_sim_insn = %lld\n", gpu_sim_insn); printf("gpu_ipc = %12.4f\n", (float)gpu_sim_insn / gpu_sim_cycle); @@ -1283,13 +1497,13 @@ void gpgpu_sim::gpu_print_stat() { // %lld\n", partiton_replys_in_parallel_total ); printf("L2_BW = %12.4f GB/Sec\n", ((float)(partiton_replys_in_parallel * 32) / - (gpu_sim_cycle * m_config.icnt_period)) / + (gpu_sim_cycle * m_config.core_period)) / 1000000000); printf("L2_BW_total = %12.4f GB/Sec\n", ((float)((partiton_replys_in_parallel + partiton_replys_in_parallel_total) * 32) / - ((gpu_tot_sim_cycle + gpu_sim_cycle) * m_config.icnt_period)) / + ((gpu_tot_sim_cycle + gpu_sim_cycle) * m_config.core_period)) / 1000000000); time_t curr_time; @@ -1308,18 +1522,32 @@ void gpgpu_sim::gpu_print_stat() { m_cluster[i]->get_cache_stats(core_cache_stats); } printf("\nTotal_core_cache_stats:\n"); - core_cache_stats.print_stats(stdout, "Total_core_cache_stats_breakdown"); + core_cache_stats.print_stats(stdout, streamID, + "Total_core_cache_stats_breakdown"); printf("\nTotal_core_cache_fail_stats:\n"); - core_cache_stats.print_fail_stats(stdout, + core_cache_stats.print_fail_stats(stdout, streamID, "Total_core_cache_fail_stats_breakdown"); shader_print_scheduler_stat(stdout, false); m_shader_stats->print(stdout); #ifdef GPGPUSIM_POWER_MODEL if (m_config.g_power_simulation_enabled) { + if (m_config.g_power_simulation_mode > 0) { + // if(!m_config.g_aggregate_power_stats) + mcpat_reset_perf_count(m_gpgpusim_wrapper); + calculate_hw_mcpat(m_config, getShaderCoreConfig(), m_gpgpusim_wrapper, + m_power_stats, m_config.gpu_stat_sample_freq, + gpu_tot_sim_cycle, gpu_sim_cycle, gpu_tot_sim_insn, + gpu_sim_insn, m_config.g_power_simulation_mode, + m_config.g_dvfs_enabled, m_config.g_hw_perf_file_name, + m_config.g_hw_perf_bench_name, executed_kernel_name(), + 
m_config.accelwattch_hybrid_configuration, + m_config.g_aggregate_power_stats); + } m_gpgpusim_wrapper->print_power_kernel_stats( gpu_sim_cycle, gpu_tot_sim_cycle, gpu_tot_sim_insn + gpu_sim_insn, kernel_info_str, true); + // if(!m_config.g_aggregate_power_stats) mcpat_reset_perf_count(m_gpgpusim_wrapper); } #endif @@ -1365,9 +1593,10 @@ void gpgpu_sim::gpu_print_stat() { printf("L2_total_cache_reservation_fails = %llu\n", total_l2_css.res_fails); printf("L2_total_cache_breakdown:\n"); - l2_stats.print_stats(stdout, "L2_cache_stats_breakdown"); + l2_stats.print_stats(stdout, streamID, "L2_cache_stats_breakdown"); printf("L2_total_cache_reservation_fail_breakdown:\n"); - l2_stats.print_fail_stats(stdout, "L2_cache_stats_fail_breakdown"); + l2_stats.print_fail_stats(stdout, streamID, + "L2_cache_stats_fail_breakdown"); total_l2_css.print_port_stats(stdout, "L2_cache"); } } @@ -1514,9 +1743,9 @@ bool shader_core_ctx::occupy_shader_resource_1block(kernel_info_t &k, SHADER_DPRINTF(LIVENESS, "GPGPU-Sim uArch: Occupied %u threads, %u shared mem, %u " - "registers, %u ctas\n", + "registers, %u ctas, on shader %d\n", m_occupied_n_threads, m_occupied_shmem, m_occupied_regs, - m_occupied_ctas); + m_occupied_ctas, m_sid); } return true; @@ -1682,9 +1911,10 @@ void shader_core_ctx::issue_block2core(kernel_info_t &kernel) { shader_CTA_count_log(m_sid, 1); SHADER_DPRINTF(LIVENESS, "GPGPU-Sim uArch: cta:%2u, start_tid:%4u, end_tid:%4u, " - "initialized @(%lld,%lld)\n", + "initialized @(%lld,%lld), kernel_uid:%u, kernel_name:%s\n", free_cta_hw_id, start_thread, end_thread, m_gpu->gpu_sim_cycle, - m_gpu->gpu_tot_sim_cycle); + m_gpu->gpu_tot_sim_cycle, kernel.get_uid(), + kernel.get_name().c_str()); } /////////////////////////////////////////////////////////////////////////////////////////// @@ -1787,6 +2017,7 @@ void gpgpu_sim::cycle() { m_power_stats->pwr_mem_stat->n_pre[CURRENT_STAT_IDX][i], m_power_stats->pwr_mem_stat->n_rd[CURRENT_STAT_IDX][i], m_power_stats->pwr_mem_stat->n_wr[CURRENT_STAT_IDX][i], + m_power_stats->pwr_mem_stat->n_wr_WB[CURRENT_STAT_IDX][i], m_power_stats->pwr_mem_stat->n_req[CURRENT_STAT_IDX][i]); } } @@ -1808,8 +2039,10 @@ void gpgpu_sim::cycle() { if (mf) partiton_reqs_in_parallel_per_cycle++; } m_memory_sub_partition[i]->cache_cycle(gpu_sim_cycle + gpu_tot_sim_cycle); - m_memory_sub_partition[i]->accumulate_L2cache_stats( - m_power_stats->pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX]); + if (m_config.g_power_simulation_enabled) { + m_memory_sub_partition[i]->accumulate_L2cache_stats( + m_power_stats->pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX]); + } } } partiton_reqs_in_parallel += partiton_reqs_in_parallel_per_cycle; @@ -1830,15 +2063,17 @@ void gpgpu_sim::cycle() { m_cluster[i]->core_cycle(); *active_sms += m_cluster[i]->get_n_active_sms(); } - // Update core icnt/cache stats for GPUWattch - m_cluster[i]->get_icnt_stats( - m_power_stats->pwr_mem_stat->n_simt_to_mem[CURRENT_STAT_IDX][i], - m_power_stats->pwr_mem_stat->n_mem_to_simt[CURRENT_STAT_IDX][i]); - m_cluster[i]->get_cache_stats( - m_power_stats->pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX]); - m_cluster[i]->get_current_occupancy( - gpu_occupancy.aggregate_warp_slot_filled, - gpu_occupancy.aggregate_theoretical_warp_slots); + // Update core icnt/cache stats for AccelWattch + if (m_config.g_power_simulation_enabled) { + m_cluster[i]->get_icnt_stats( + m_power_stats->pwr_mem_stat->n_simt_to_mem[CURRENT_STAT_IDX][i], + m_power_stats->pwr_mem_stat->n_mem_to_simt[CURRENT_STAT_IDX][i]); + m_cluster[i]->get_cache_stats( + 
m_power_stats->pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX]);
+      m_cluster[i]->get_current_occupancy(
+          gpu_occupancy.aggregate_warp_slot_filled,
+          gpu_occupancy.aggregate_theoretical_warp_slots);
+    }
   }
   float temp = 0;
   for (unsigned i = 0; i < m_shader_config->num_shader(); i++) {
@@ -1860,10 +2095,12 @@ void gpgpu_sim::cycle() {
     // McPAT main cycle (interface with McPAT)
 #ifdef GPGPUSIM_POWER_MODEL
     if (m_config.g_power_simulation_enabled) {
-      mcpat_cycle(m_config, getShaderCoreConfig(), m_gpgpusim_wrapper,
-                  m_power_stats, m_config.gpu_stat_sample_freq,
-                  gpu_tot_sim_cycle, gpu_sim_cycle, gpu_tot_sim_insn,
-                  gpu_sim_insn);
+      if (m_config.g_power_simulation_mode == 0) {
+        mcpat_cycle(m_config, getShaderCoreConfig(), m_gpgpusim_wrapper,
+                    m_power_stats, m_config.gpu_stat_sample_freq,
+                    gpu_tot_sim_cycle, gpu_sim_cycle, gpu_tot_sim_insn,
+                    gpu_sim_insn, m_config.g_dvfs_enabled);
+      }
     }
 #endif

@@ -1924,7 +2161,7 @@ void gpgpu_sim::cycle() {
       m_cluster[i]->get_current_occupancy(active, total);
     }
     DPRINTFG(LIVENESS,
-             "uArch: inst.: %lld (ipc=%4.1f, occ=%0.4f\% [%llu / %llu]) "
+             "uArch: inst.: %lld (ipc=%4.1f, occ=%0.4f%% [%llu / %llu]) "
             "sim_rate=%u (inst/sec) elapsed = %u:%u:%02u:%02u / %s",
             gpu_tot_sim_insn + gpu_sim_insn,
             (double)gpu_sim_insn / (double)gpu_sim_cycle,
@@ -1972,6 +2209,11 @@ void gpgpu_sim::cycle() {
   }
 }

+void sst_gpgpu_sim::cycle() {
+  SST_cycle();
+  return;
+}
+
 void shader_core_ctx::dump_warp_state(FILE *fout) const {
   fprintf(fout, "\n");
   fprintf(fout, "per warp functional simulation status:\n");
@@ -2051,3 +2293,110 @@ const shader_core_config *gpgpu_sim::getShaderCoreConfig() {
 const memory_config *gpgpu_sim::getMemoryConfig() { return m_memory_config; }

 simt_core_cluster *gpgpu_sim::getSIMTCluster() { return *m_cluster; }
+
+void sst_gpgpu_sim::SST_gpgpusim_numcores_equal_check(unsigned sst_numcores) {
+  if (m_shader_config->n_simt_clusters != sst_numcores) {
+    assert(
+        "\nThe SST core count does not equal the GPGPU-Sim core count. Open "
+        "the gpgpu-sim.config file and ensure n_simt_clusters "
+        "is the same as the SST GPU core count.\n" &&
+        0);
+  } else {
+    printf("\nSST GPU core count equals the GPGPU-Sim core count = %d\n",
+           sst_numcores);
+  }
+}
+
+void sst_gpgpu_sim::SST_cycle() {
+  // shader core loading (pop from ICNT into core) follows CORE clock
+  for (unsigned i = 0; i < m_shader_config->n_simt_clusters; i++)
+    static_cast<sst_simt_core_cluster *>(m_cluster[i])->icnt_cycle_SST();
+
+  // L1 cache + shader core pipeline stages
+  m_power_stats->pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].clear();
+  for (unsigned i = 0; i < m_shader_config->n_simt_clusters; i++) {
+    if (m_cluster[i]->get_not_completed() || get_more_cta_left()) {
+      m_cluster[i]->core_cycle();
+      *active_sms += m_cluster[i]->get_n_active_sms();
+    }
+    // Update core icnt/cache stats for AccelWattch
+    m_cluster[i]->get_icnt_stats(
+        m_power_stats->pwr_mem_stat->n_simt_to_mem[CURRENT_STAT_IDX][i],
+        m_power_stats->pwr_mem_stat->n_mem_to_simt[CURRENT_STAT_IDX][i]);
+    m_cluster[i]->get_cache_stats(
+        m_power_stats->pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX]);
+  }
+  float temp = 0;
+  for (unsigned i = 0; i < m_shader_config->num_shader(); i++) {
+    temp += m_shader_stats->m_pipeline_duty_cycle[i];
+  }
+  temp = temp / m_shader_config->num_shader();
+  *average_pipeline_duty_cycle = ((*average_pipeline_duty_cycle) + temp);
+  // cout<<"Average pipeline duty cycle: "<<*average_pipeline_duty_cycle<<endl;
+
+  if (g_single_step &&
+      ((gpu_sim_cycle + gpu_tot_sim_cycle) >= g_single_step)) {
+    asm("int $03");
+  }
+  gpu_sim_cycle++;
+  if (g_interactive_debugger_enabled) gpgpu_debug();
+
+  // McPAT main cycle (interface with McPAT)
+#ifdef GPGPUSIM_POWER_MODEL
+  if (m_config.g_power_simulation_enabled) {
+    mcpat_cycle(m_config, getShaderCoreConfig(), m_gpgpusim_wrapper,
+                m_power_stats, m_config.gpu_stat_sample_freq, gpu_tot_sim_cycle,
+                gpu_sim_cycle, gpu_tot_sim_insn, gpu_sim_insn,
+                m_config.g_dvfs_enabled);
+  }
+#endif
+
+  issue_block2core();
+
+  if (!(gpu_sim_cycle % m_config.gpu_stat_sample_freq)) {
+    time_t days, hrs, minutes, sec;
+    time_t curr_time;
+    time(&curr_time);
+    unsigned long long elapsed_time =
+        MAX(curr_time - gpgpu_ctx->the_gpgpusim->g_simulation_starttime, 1);
+    if ((elapsed_time - last_liveness_message_time) >=
+        m_config.liveness_message_freq) {
+      days = elapsed_time / (3600 * 24);
+      hrs = elapsed_time / 3600 - 24 * days;
+      minutes = elapsed_time / 60 - 60 * (hrs + 24 * days);
+      sec = elapsed_time - 60 * (minutes + 60 * (hrs + 24 * days));
+
+      last_liveness_message_time = elapsed_time;
+    }
+    visualizer_printstat();
+    m_memory_stats->memlatstat_lat_pw();
+    if (m_config.gpgpu_runtime_stat && (m_config.gpu_runtime_stat_flag != 0)) {
+      if (m_config.gpu_runtime_stat_flag & GPU_RSTAT_BW_STAT) {
+        for (unsigned i = 0; i < m_memory_config->m_n_mem; i++)
+          m_memory_partition_unit[i]->print_stat(stdout);
+        printf("maxmrqlatency = %d \n", m_memory_stats->max_mrq_latency);
+        printf("maxmflatency = %d \n", m_memory_stats->max_mf_latency);
+      }
+      if (m_config.gpu_runtime_stat_flag & GPU_RSTAT_SHD_INFO)
+        shader_print_runtime_stat(stdout);
+      if (m_config.gpu_runtime_stat_flag & GPU_RSTAT_L1MISS)
+        shader_print_l1_miss_stat(stdout);
+      if (m_config.gpu_runtime_stat_flag & GPU_RSTAT_SCHED)
+        shader_print_scheduler_stat(stdout, false);
+    }
+  }
+
+  if (!(gpu_sim_cycle % 20000)) {
+    // deadlock detection
+    if (m_config.gpu_deadlock_detect && gpu_sim_insn == last_gpu_sim_insn) {
+      gpu_deadlock = true;
+    } else {
+      last_gpu_sim_insn = gpu_sim_insn;
+    }
+  }
+  try_snap_shot(gpu_sim_cycle);
+  spill_log_to_file(stdout, 0, gpu_sim_cycle);
+
+#if (CUDART_VERSION >= 5000)
+  // 
launch device kernel + gpgpu_ctx->device_runtime->launch_one_device_kernel(); +#endif +} diff --git a/src/gpgpu-sim/gpu-sim.h b/src/gpgpu-sim/gpu-sim.h index 2e6820d82..d0c2a1763 100644 --- a/src/gpgpu-sim/gpu-sim.h +++ b/src/gpgpu-sim/gpu-sim.h @@ -1,18 +1,21 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Wilson W.L. Fung -// The University of British Columbia +// Copyright (c) 2009-2021, Tor M. Aamodt, Wilson W.L. Fung, Vijay Kandiah, +// Nikos Hardavellas Mahmoud Khairy, Junrui Pan, Timothy G. Rogers The +// University of British Columbia, Northwestern University, Purdue University // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, +// this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. 
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
@@ -66,8 +69,63 @@ class gpgpu_context;

 extern tr1_hash_map<new_addr_type, unsigned> address_random_interleaving;

+// SST communication functions
+/**
+ * @brief Check if SST requests buffer is full
+ *
+ * @param core_id
+ * @return true
+ * @return false
+ */
+extern bool is_SST_buffer_full(unsigned core_id);
+
+/**
+ * @brief Send loads to SST memory backend
+ *
+ * @param core_id
+ * @param address
+ * @param size
+ * @param mem_req
+ */
+extern void send_read_request_SST(unsigned core_id, uint64_t address,
+                                  size_t size, void *mem_req);
+
+/**
+ * @brief Send stores to SST memory backend
+ *
+ * @param core_id
+ * @param address
+ * @param size
+ * @param mem_req
+ */
+extern void send_write_request_SST(unsigned core_id, uint64_t address,
+                                   size_t size, void *mem_req);
+
 enum dram_ctrl_t { DRAM_FIFO = 0, DRAM_FRFCFS = 1 };

+enum hw_perf_t {
+  HW_BENCH_NAME = 0,
+  HW_KERNEL_NAME,
+  HW_L1_RH,
+  HW_L1_RM,
+  HW_L1_WH,
+  HW_L1_WM,
+  HW_CC_ACC,
+  HW_SHRD_ACC,
+  HW_DRAM_RD,
+  HW_DRAM_WR,
+  HW_L2_RH,
+  HW_L2_RM,
+  HW_L2_WH,
+  HW_L2_WM,
+  HW_NOC,
+  HW_PIPE_DUTY,
+  HW_NUM_SM_IDLE,
+  HW_CYCLES,
+  HW_VOLTAGE,
+  HW_TOTAL_STATS
+};
+
 struct power_config {
   power_config() { m_valid = true; }
   void init() {
@@ -82,7 +140,8 @@ struct power_config {
       s++;
     }
     char buf1[1024];
-    snprintf(buf1, 1024, "gpgpusim_power_report__%s.log", date);
+    // snprintf(buf1, 1024, "accelwattch_power_report__%s.log", date);
+    snprintf(buf1, 1024, "accelwattch_power_report.log");
     g_power_filename = strdup(buf1);
     char buf2[1024];
     snprintf(buf2, 1024, "gpgpusim_power_trace_report__%s.log.gz", date);
@@ -94,6 +153,9 @@ struct power_config {
     snprintf(buf4, 1024, "gpgpusim_steady_state_tracking_report__%s.log.gz",
              date);
     g_steady_state_tracking_filename = strdup(buf4);
+    // for(int i =0; i< hw_perf_t::HW_TOTAL_STATS; i++){
+    //   accelwattch_hybrid_configuration[i] = 0;
+    // }

     if (g_steady_power_levels_enabled) {
       sscanf(gpu_steady_state_definition, "%lf:%lf",
@@ -102,9 +164,9 @@ struct power_config {

     // NOTE: After changing the nonlinear model to only scaling idle core,
     // NOTE: The min_inc_per_active_sm is not used any more
-    if (g_use_nonlinear_model)
-      sscanf(gpu_nonlinear_model_config, "%lf:%lf", &gpu_idle_core_power,
-             &gpu_min_inc_per_active_sm);
+    // if (g_use_nonlinear_model)
+    //   sscanf(gpu_nonlinear_model_config, "%lf:%lf", &gpu_idle_core_power,
+    //          &gpu_min_inc_per_active_sm);
   }
   void reg_options(class OptionParser *opp);

@@ -125,6 +187,13 @@ struct power_config {
   double gpu_steady_power_deviation;
   double gpu_steady_min_period;

+  char *g_hw_perf_file_name;
+  char *g_hw_perf_bench_name;
+  int g_power_simulation_mode;
+  bool g_dvfs_enabled;
+  bool g_aggregate_power_stats;
+  bool accelwattch_hybrid_configuration[hw_perf_t::HW_TOTAL_STATS];
+
   // Nonlinear power model
   bool g_use_nonlinear_model;
   char *gpu_nonlinear_model_config;
@@ -237,6 +306,14 @@ class memory_config {
   }
   void reg_options(class OptionParser *opp);

+  /**
+   * @brief Check if the config script is in SST mode
+   *
+   * @return true
+   * @return false
+   */
+  bool is_SST_mode() const { return SST_mode; }
+
   bool m_valid;
   mutable l2_cache_config m_L2_config;
   bool m_L2_texure_only;
@@ -314,7 +391,7 @@ class memory_config {
   unsigned write_low_watermark;
   bool m_perf_sim_memcpy;
   bool simple_dram_model;
-
+  bool SST_mode;
   gpgpu_context *gpgpu_ctx;
 };

@@ -357,10 +434,19 @@ class gpgpu_sim_config : public power_config,
     m_valid = true;
   }
-
+ 
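
The accelwattch_hybrid_configuration flags declared above gate, per counter, where AccelWattch's hybrid mode (power_simulation_mode == 2) sources each statistic: from the hardware CSV (hw_perf.csv) or from Accel-Sim. A sketch of the selection this implies; the helper and its default direction are assumptions for illustration, not patch code:

// Illustrative selector: flag set => take the counter from Accel-Sim,
// otherwise fall back to the hardware measurement. The helper name and
// the fallback direction are assumptions, not part of this patch.
static double hybrid_counter(const power_config &cfg, hw_perf_t which,
                             double hw_value, double sim_value) {
  if (cfg.g_power_simulation_mode == 2 &&
      cfg.accelwattch_hybrid_configuration[which])
    return sim_value;
  return hw_value;
}
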
  unsigned get_core_freq() const { return core_freq; }
  unsigned num_shader() const { return m_shader_config.num_shader(); }
  unsigned num_cluster() const { return m_shader_config.n_simt_clusters; }
  unsigned get_max_concurrent_kernel() const { return max_concurrent_kernel; }
+
+  /**
+   * @brief Check if we are in SST mode
+   *
+   * @return true
+   * @return false
+   */
+  bool is_SST_mode() const { return m_memory_config.SST_mode; }
+
  unsigned checkpoint_option;

  size_t stack_limit() const { return stack_size_limit; }
@@ -425,6 +511,7 @@ class gpgpu_sim_config : public power_config,
  unsigned long long liveness_message_freq;

  friend class gpgpu_sim;
+  friend class sst_gpgpu_sim;
 };

 struct occupancy_stats {
@@ -502,7 +589,7 @@ class gpgpu_sim : public gpgpu_t {
           (m_config.gpu_max_completed_cta_opt &&
            (gpu_completed_cta >= m_config.gpu_max_completed_cta_opt));
  }
-  void print_stats();
+  void print_stats(unsigned long long streamID);
  void update_stats();
  void deadlock_check();
  void inc_completed_cta() { gpu_completed_cta++; }
@@ -527,10 +614,11 @@ class gpgpu_sim : public gpgpu_t {
  bool kernel_more_cta_left(kernel_info_t *kernel) const;
  bool hit_max_cta_count() const;
  kernel_info_t *select_kernel();
+  PowerscalingCoefficients *get_scaling_coeffs();
  void decrement_kernel_latency();

  const gpgpu_sim_config &get_config() const { return m_config; }
-  void gpu_print_stat();
+  void gpu_print_stat(unsigned long long streamID);
  void dump_pipeline(int mask, int s, int m) const;

  void perf_memcpy_to_gpu(size_t dst_start_addr, size_t count);
@@ -562,10 +650,18 @@ class gpgpu_sim : public gpgpu_t {
  void hit_watchpoint(unsigned watchpoint_num, ptx_thread_info *thd,
                      const ptx_instruction *pI);

+  /**
+   * @brief Check if we are in SST mode
+   *
+   * @return true
+   * @return false
+   */
+  bool is_SST_mode() { return m_config.is_SST_mode(); }
+
  // backward pointer
  class gpgpu_context *gpgpu_ctx;

- private:
+ protected:
  // clocks
  void reinit_clock_domains(void);
  int next_clock_domain(void);
@@ -634,6 +730,7 @@ class gpgpu_sim : public gpgpu_t {

  std::string executed_kernel_info_string();  //< format the kernel information
                                              // into a string for stat printout
+  std::string executed_kernel_name();
  void clear_executed_kernel_info();  //< clear the kernel information after
                                      // stat printout
  virtual void createSIMTCluster() = 0;
@@ -646,6 +743,17 @@ class gpgpu_sim : public gpgpu_t {
  occupancy_stats gpu_occupancy;
  occupancy_stats gpu_tot_occupancy;

+  typedef struct {
+    unsigned long long start_cycle;
+    unsigned long long end_cycle;
+  } kernel_time_t;
+  std::map<unsigned long long, std::map<unsigned, kernel_time_t>>
+      gpu_kernel_time;
+  unsigned long long last_streamID;
+  unsigned long long last_uid;
+  cache_stats aggregated_l1_stats;
+  cache_stats aggregated_l2_stats;
+
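
Given the kernel_time_t records above, per-kernel runtime in GPU cycles falls out of a subtraction once set_kernel_done() has filled in end_cycle. A usage sketch (the iteration follows directly from the map types above; the printf format is illustrative):

// Walk gpu_kernel_time: outer key is the CUDA stream ID, inner key the kernel uid.
for (auto const &stream_entry : gpu_kernel_time) {
  for (auto const &kernel_entry : stream_entry.second) {
    const kernel_time_t &t = kernel_entry.second;
    printf("stream %llu kernel %u: %llu cycles\n", stream_entry.first,
           kernel_entry.first, t.end_cycle - t.start_cycle);
  }
}
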
  // performance counter for stalls due to congestion.
  unsigned int gpu_stall_dramfull;
  unsigned int gpu_stall_icnt2sh;
@@ -665,7 +773,7 @@ class gpgpu_sim : public gpgpu_t {
  void set_cache_config(std::string kernel_name);

  // Jin: functional simulation for CDP
- private:
+ protected:
  // set by stream operation every time a functional simulation is done
  bool m_functional_sim;
  kernel_info_t *m_functional_sim_kernel;

 public:
  bool is_functional_sim() { return m_functional_sim; }
  kernel_info_t *get_functional_kernel() { return m_functional_sim_kernel; }
+  std::vector<kernel_info_t *> get_running_kernels() {
+    return m_running_kernels;
+  }
  void functional_launch(kernel_info_t *k) {
    m_functional_sim = true;
    m_functional_sim_kernel = k;
@@ -695,4 +806,79 @@ class exec_gpgpu_sim : public gpgpu_sim {
  virtual void createSIMTCluster();
 };

+/**
+ * @brief A GPGPU-Sim class customized for SST Balar interfacing
+ *
+ */
+class sst_gpgpu_sim : public gpgpu_sim {
+ public:
+  sst_gpgpu_sim(const gpgpu_sim_config &config, gpgpu_context *ctx)
+      : gpgpu_sim(config, ctx) {
+    createSIMTCluster();
+  }
+
+  // SST memory handling
+  std::vector<std::deque<mem_fetch *>>
+      SST_gpgpu_reply_buffer; /** SST mem response queue */
+
+  /**
+   * @brief Receive a memory request's response from SST and put
+   * it in a buffer (SST_gpgpu_reply_buffer)
+   *
+   * @param core_id
+   * @param mem_req
+   */
+  void SST_receive_mem_reply(unsigned core_id, void *mem_req);
+
+  /**
+   * @brief Pop the head of the buffer queue to get the
+   * memory response
+   *
+   * @param core_id
+   * @return mem_fetch*
+   */
+  mem_fetch *SST_pop_mem_reply(unsigned core_id);
+
+  virtual void createSIMTCluster();
+
+  // SST Balar interfacing
+  /**
+   * @brief Advance the cores and collect stats
+   *
+   */
+  void SST_cycle();
+
+  /**
+   * @brief Wrapper of SST_cycle()
+   *
+   */
+  void cycle();
+
+  /**
+   * @brief Whether the GPU is active; the memory-system test is removed
+   * since that is handled in SST
+   *
+   * @return true
+   * @return false
+   */
+  bool active();
+
+  /**
+   * @brief SST mode uses the SST memory system instead, so the memcpy
+   * is a no-op here
+   *
+   * @param dst_start_addr
+   * @param count
+   */
+  void perf_memcpy_to_gpu(size_t dst_start_addr, size_t count){};
+
+  /**
+   * @brief Check that the SST config and gpgpusim.config agree on the
+   * core count
+   *
+   * @param sst_numcores SST core count
+   */
+  void SST_gpgpusim_numcores_equal_check(unsigned sst_numcores);
+};
+
 #endif
diff --git a/src/gpgpu-sim/l2cache.cc b/src/gpgpu-sim/l2cache.cc
index ab6e5c228..52eed0ef7 100644
--- a/src/gpgpu-sim/l2cache.cc
+++ b/src/gpgpu-sim/l2cache.cc
@@ -1,18 +1,21 @@
-// Copyright (c) 2009-2011, Tor M. Aamodt
-// The University of British Columbia
-// All rights reserved.
+// Copyright (c) 2009-2021, Tor M. Aamodt, Vijay Kandiah, Nikos Hardavellas,
+// Mahmoud Khairy, Junrui Pan, Timothy G. Rogers
+// The University of British Columbia, Northwestern University, Purdue
+// University All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are met:
 //
-// Redistributions of source code must retain the above copyright notice, this
-// list of conditions and the following disclaimer.
-// Redistributions in binary form must reproduce the above copyright notice,
-// this list of conditions and the following disclaimer in the documentation
-// and/or other materials provided with the distribution. 
Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, +// this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -48,15 +51,28 @@ mem_fetch *partition_mf_allocator::alloc(new_addr_type addr, mem_access_type type, unsigned size, - bool wr, - unsigned long long cycle) const { + bool wr, unsigned long long cycle, + unsigned long long streamID) const { assert(wr); mem_access_t access(type, addr, size, wr, m_memory_config->gpgpu_ctx); - mem_fetch *mf = new mem_fetch(access, NULL, WRITE_PACKET_SIZE, -1, -1, -1, - m_memory_config, cycle); + mem_fetch *mf = new mem_fetch(access, NULL, streamID, WRITE_PACKET_SIZE, -1, + -1, -1, m_memory_config, cycle); return mf; } +mem_fetch *partition_mf_allocator::alloc( + new_addr_type addr, mem_access_type type, const active_mask_t &active_mask, + const mem_access_byte_mask_t &byte_mask, + const mem_access_sector_mask_t §or_mask, unsigned size, bool wr, + unsigned long long cycle, unsigned wid, unsigned sid, unsigned tpc, + mem_fetch *original_mf, unsigned long long streamID) const { + mem_access_t access(type, addr, size, wr, active_mask, byte_mask, sector_mask, + m_memory_config->gpgpu_ctx); + mem_fetch *mf = new mem_fetch(access, NULL, streamID, + wr ? 
WRITE_PACKET_SIZE : READ_PACKET_SIZE, wid, + sid, tpc, m_memory_config, cycle, original_mf); + return mf; +} memory_partition_unit::memory_partition_unit(unsigned partition_id, const memory_config *config, class memory_stats_t *stats, @@ -375,9 +391,10 @@ void memory_partition_unit::set_done(mem_fetch *mf) { void memory_partition_unit::set_dram_power_stats( unsigned &n_cmd, unsigned &n_activity, unsigned &n_nop, unsigned &n_act, - unsigned &n_pre, unsigned &n_rd, unsigned &n_wr, unsigned &n_req) const { + unsigned &n_pre, unsigned &n_rd, unsigned &n_wr, unsigned &n_wr_WB, + unsigned &n_req) const { m_dram->set_dram_power_stats(n_cmd, n_activity, n_nop, n_act, n_pre, n_rd, - n_wr, n_req); + n_wr, n_wr_WB, n_req); } void memory_partition_unit::print(FILE *fp) const { @@ -419,9 +436,9 @@ memory_sub_partition::memory_sub_partition(unsigned sub_partition_id, m_mf_allocator = new partition_mf_allocator(config); if (!m_config->m_L2_config.disabled()) - m_L2cache = - new l2_cache(L2c_name, m_config->m_L2_config, -1, -1, m_L2interface, - m_mf_allocator, IN_PARTITION_L2_MISS_QUEUE, gpu); + m_L2cache = new l2_cache(L2c_name, m_config->m_L2_config, -1, -1, + m_L2interface, m_mf_allocator, + IN_PARTITION_L2_MISS_QUEUE, gpu, L2_GPU_CACHE); unsigned int icnt_L2; unsigned int L2_dram; @@ -541,10 +558,15 @@ void memory_sub_partition::cache_cycle(unsigned cycle) { m_config->m_L2_config.m_write_alloc_policy == LAZY_FETCH_ON_READ) && !was_writeallocate_sent(events)) { - mf->set_reply(); - mf->set_status(IN_PARTITION_L2_TO_ICNT_QUEUE, - m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle); - m_L2_icnt_queue->push(mf); + if (mf->get_access_type() == L1_WRBK_ACC) { + m_request_tracker.erase(mf); + delete mf; + } else if (m_config->m_L2_config.get_write_policy() == WRITE_BACK) { + mf->set_reply(); + mf->set_status(IN_PARTITION_L2_TO_ICNT_QUEUE, + m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle); + m_L2_icnt_queue->push(mf); + } } // L2 cache accepted request m_icnt_L2_queue->pop(); @@ -646,6 +668,7 @@ void gpgpu_sim::print_dram_stats(FILE *fout) const { unsigned pre = 0; unsigned rd = 0; unsigned wr = 0; + unsigned wr_WB = 0; unsigned req = 0; unsigned tot_cmd = 0; unsigned tot_nop = 0; @@ -657,13 +680,13 @@ void gpgpu_sim::print_dram_stats(FILE *fout) const { for (unsigned i = 0; i < m_memory_config->m_n_mem; i++) { m_memory_partition_unit[i]->set_dram_power_stats(cmd, activity, nop, act, - pre, rd, wr, req); + pre, rd, wr, wr_WB, req); tot_cmd += cmd; tot_nop += nop; tot_act += act; tot_pre += pre; tot_rd += rd; - tot_wr += wr; + tot_wr += wr + wr_WB; tot_req += req; } fprintf(fout, "gpgpu_n_dram_reads = %d\n", tot_rd); @@ -694,71 +717,69 @@ bool memory_sub_partition::busy() const { return !m_request_tracker.empty(); } std::vector memory_sub_partition::breakdown_request_to_sector_requests(mem_fetch *mf) { std::vector result; - + mem_access_sector_mask_t sector_mask = mf->get_access_sector_mask(); if (mf->get_data_size() == SECTOR_SIZE && mf->get_access_sector_mask().count() == 1) { result.push_back(mf); - } else if (mf->get_data_size() == 128 || mf->get_data_size() == 64) { - // We only accept 32, 64 and 128 bytes reqs - unsigned start = 0, end = 0; - if (mf->get_data_size() == 128) { + } else if (mf->get_data_size() == MAX_MEMORY_ACCESS_SIZE) { + // break down every sector + mem_access_byte_mask_t mask; + for (unsigned i = 0; i < SECTOR_CHUNCK_SIZE; i++) { + for (unsigned k = i * SECTOR_SIZE; k < (i + 1) * SECTOR_SIZE; k++) { + mask.set(k); + } + mem_fetch *n_mf = m_mf_allocator->alloc( + mf->get_addr() + 
SECTOR_SIZE * i, mf->get_access_type(), + mf->get_access_warp_mask(), mf->get_access_byte_mask() & mask, + std::bitset<SECTOR_CHUNCK_SIZE>().set(i), SECTOR_SIZE, mf->is_write(), + m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, mf->get_wid(), + mf->get_sid(), mf->get_tpc(), mf, mf->get_streamID()); + + result.push_back(n_mf); + } + // This is for constant cache + } else if (mf->get_data_size() == 64 && + (mf->get_access_sector_mask().all() || + mf->get_access_sector_mask().none())) { + unsigned start; + if (mf->get_addr() % MAX_MEMORY_ACCESS_SIZE == 0) start = 0; - end = 3; - } else if (mf->get_data_size() == 64 && - mf->get_access_sector_mask().to_string() == "1100") { + else start = 2; - end = 3; - } else if (mf->get_data_size() == 64 && - mf->get_access_sector_mask().to_string() == "0011") { - start = 0; - end = 1; - } else if (mf->get_data_size() == 64 && - (mf->get_access_sector_mask().to_string() == "1111" || - mf->get_access_sector_mask().to_string() == "0000")) { - if (mf->get_addr() % 128 == 0) { - start = 0; - end = 1; - } else { - start = 2; - end = 3; + mem_access_byte_mask_t mask; + for (unsigned i = start; i < start + 2; i++) { + for (unsigned k = i * SECTOR_SIZE; k < (i + 1) * SECTOR_SIZE; k++) { + mask.set(k); } - } else { - printf( - "Invalid sector received, address = 0x%06llx, sector mask = %s, data " - "size = %d", - mf->get_addr(), mf->get_access_sector_mask(), mf->get_data_size()); - assert(0 && "Undefined sector mask is received"); - } - - std::bitset<MAX_MEMORY_ACCESS_SIZE> byte_sector_mask; - byte_sector_mask.reset(); - for (unsigned k = start * SECTOR_SIZE; k < SECTOR_SIZE; ++k) - byte_sector_mask.set(k); - - for (unsigned j = start, i = 0; j <= end; ++j, ++i) { - const mem_access_t *ma = new mem_access_t( - mf->get_access_type(), mf->get_addr() + SECTOR_SIZE * i, SECTOR_SIZE, - mf->is_write(), mf->get_access_warp_mask(), - mf->get_access_byte_mask() & byte_sector_mask, - std::bitset<SECTOR_CHUNCK_SIZE>().set(j), m_gpu->gpgpu_ctx); - - mem_fetch *n_mf = - new mem_fetch(*ma, NULL, mf->get_ctrl_size(), mf->get_wid(), - mf->get_sid(), mf->get_tpc(), mf->get_mem_config(), - m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, mf); + mem_fetch *n_mf = m_mf_allocator->alloc( + mf->get_addr(), mf->get_access_type(), mf->get_access_warp_mask(), + mf->get_access_byte_mask() & mask, + std::bitset<SECTOR_CHUNCK_SIZE>().set(i), SECTOR_SIZE, mf->is_write(), + m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, mf->get_wid(), + mf->get_sid(), mf->get_tpc(), mf, mf->get_streamID()); result.push_back(n_mf); - byte_sector_mask <<= SECTOR_SIZE; } } else { - printf( - "Invalid sector received, address = 0x%06llx, sector mask = %d, byte " - "mask = , data size = %u", - mf->get_addr(), mf->get_access_sector_mask().count(), - mf->get_data_size()); - assert(0 && "Undefined data size is received"); + for (unsigned i = 0; i < SECTOR_CHUNCK_SIZE; i++) { + if (sector_mask.test(i)) { + mem_access_byte_mask_t mask; + for (unsigned k = i * SECTOR_SIZE; k < (i + 1) * SECTOR_SIZE; k++) { + mask.set(k); + } + mem_fetch *n_mf = m_mf_allocator->alloc( + mf->get_addr() + SECTOR_SIZE * i, mf->get_access_type(), + mf->get_access_warp_mask(), mf->get_access_byte_mask() & mask, + std::bitset<SECTOR_CHUNCK_SIZE>().set(i), SECTOR_SIZE, + mf->is_write(), m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, + mf->get_wid(), mf->get_sid(), mf->get_tpc(), mf, + mf->get_streamID()); + + result.push_back(n_mf); + } + } } - + if (result.size() == 0) assert(0 && "no mf sent"); return result; } diff --git a/src/gpgpu-sim/l2cache.h b/src/gpgpu-sim/l2cache.h index 3152db337..65c9c38b3 100644 --- a/src/gpgpu-sim/l2cache.h +++ 
b/src/gpgpu-sim/l2cache.h @@ -1,18 +1,21 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt -// The University of British Columbia -// All rights reserved. +// Copyright (c) 2009-2021, Tor M. Aamodt, Vijay Kandiah, Nikos Hardavellas, +// Mahmoud Khairy, Junrui Pan, Timothy G. Rogers +// The University of British Columbia, Northwestern University, Purdue +// University All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, +// this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -49,8 +52,16 @@ class partition_mf_allocator : public mem_fetch_allocator { return NULL; } virtual mem_fetch *alloc(new_addr_type addr, mem_access_type type, - unsigned size, bool wr, - unsigned long long cycle) const; + unsigned size, bool wr, unsigned long long cycle, + unsigned long long streamID) const; + virtual mem_fetch *alloc(new_addr_type addr, mem_access_type type, + const active_mask_t &active_mask, + const mem_access_byte_mask_t &byte_mask, + const mem_access_sector_mask_t &sector_mask, + unsigned size, bool wr, unsigned long long cycle, + unsigned wid, unsigned sid, unsigned tpc, + mem_fetch *original_mf, + unsigned long long streamID) const; private: const memory_config *m_memory_config; @@ -88,7 +99,7 @@ class memory_partition_unit { // Power model void set_dram_power_stats(unsigned &n_cmd, unsigned &n_activity, unsigned &n_nop, unsigned &n_act, unsigned &n_pre, - unsigned &n_rd, unsigned &n_wr, + unsigned &n_rd, unsigned &n_wr, unsigned &n_wr_WB, unsigned &n_req) const; int global_sub_partition_id_to_local_id(int global_sub_partition_id) const; diff --git a/src/gpgpu-sim/local_interconnect.cc b/src/gpgpu-sim/local_interconnect.cc index 0e204623b..7e1ab5b04 100644 --- a/src/gpgpu-sim/local_interconnect.cc +++ b/src/gpgpu-sim/local_interconnect.cc @@ -148,8 +148,8 @@ void xbar_router::RR_Advance() { } } } - - next_node_id = (++next_node_id % total_nodes); + next_node_id = next_node_id + 1; + next_node_id = (next_node_id % total_nodes); conflicts += conflict_sub; if (active) { @@ -159,8 +159,8 @@ } if (verbose) { - printf("%d : cycle %d : conflicts = %d\n", m_id, cycles, conflict_sub); - printf("%d : cycle %d : 
passing reqs = %d\n", m_id, cycles, reqs); + printf("%d : cycle %llu : conflicts = %d\n", m_id, cycles, conflict_sub); + printf("%d : cycle %llu : passing reqs = %d\n", m_id, cycles, reqs); } // collect some stats about buffer util @@ -217,7 +217,7 @@ void xbar_router::iSLIP_Advance() { out_buffers[_packet.output_deviceID].push(_packet); in_buffers[node_id].pop(); if (verbose) - printf("%d : cycle %d : send req from %d to %d\n", m_id, cycles, + printf("%d : cycle %llu : send req from %d to %d\n", m_id, cycles, node_id, i - _n_shader); if (grant_cycles_count == 1) next_node[i] = (++node_id % total_nodes); @@ -228,7 +228,7 @@ Packet _packet2 = in_buffers[node_id2].front(); if (_packet2.output_deviceID == i) - printf("%d : cycle %d : cannot send req from %d to %d\n", + printf("%d : cycle %llu : cannot send req from %d to %d\n", m_id, cycles, node_id2, i - _n_shader); } } @@ -248,7 +248,7 @@ } if (verbose) - printf("%d : cycle %d : grant_cycles = %d\n", m_id, cycles, grant_cycles); + printf("%d : cycle %llu : grant_cycles = %d\n", m_id, cycles, grant_cycles); if (active && grant_cycles_count == 1) grant_cycles_count = grant_cycles; @@ -256,8 +256,8 @@ grant_cycles_count--; if (verbose) { - printf("%d : cycle %d : conflicts = %d\n", m_id, cycles, conflict_sub); - printf("%d : cycle %d : passing reqs = %d\n", m_id, cycles, reqs); + printf("%d : cycle %llu : conflicts = %d\n", m_id, cycles, conflict_sub); + printf("%d : cycle %llu : passing reqs = %d\n", m_id, cycles, reqs); } // collect some stats about buffer util diff --git a/src/gpgpu-sim/mem_fetch.cc b/src/gpgpu-sim/mem_fetch.cc index 456d891dd..809c92081 100644 --- a/src/gpgpu-sim/mem_fetch.cc +++ b/src/gpgpu-sim/mem_fetch.cc @@ -35,10 +35,10 @@ unsigned mem_fetch::sm_next_mf_request_uid = 1; mem_fetch::mem_fetch(const mem_access_t &access, const warp_inst_t *inst, - unsigned ctrl_size, unsigned wid, unsigned sid, - unsigned tpc, const memory_config *config, - unsigned long long cycle, mem_fetch *m_original_mf, - mem_fetch *m_original_wr_mf) + unsigned long long streamID, unsigned ctrl_size, + unsigned wid, unsigned sid, unsigned tpc, + const memory_config *config, unsigned long long cycle, + mem_fetch *m_original_mf, mem_fetch *m_original_wr_mf) : m_access(access) { @@ -48,14 +48,21 @@ mem_fetch::mem_fetch(const mem_access_t &access, const warp_inst_t *inst, m_inst = *inst; assert(wid == m_inst.warp_id()); } + m_streamID = streamID; m_data_size = access.get_size(); m_ctrl_size = ctrl_size; m_sid = sid; m_tpc = tpc; m_wid = wid; - config->m_address_mapping.addrdec_tlx(access.get_addr(), &m_raw_addr); - m_partition_addr = - config->m_address_mapping.partition_address(access.get_addr()); + + if (!config->is_SST_mode()) { + // In SST memory model, the SST memory hierarchy is + // responsible to generate the correct address mapping + config->m_address_mapping.addrdec_tlx(access.get_addr(), &m_raw_addr); + m_partition_addr = + config->m_address_mapping.partition_address(access.get_addr()); + } + m_type = m_access.is_write() ? WRITE_REQUEST : READ_REQUEST; m_timestamp = cycle; m_timestamp2 = 0; @@ -84,10 +91,10 @@ mem_fetch::~mem_fetch() { m_status = MEM_FETCH_DELETED; } #undef MF_TUP_END void mem_fetch::print(FILE *fp, bool print_inst) const { - if (this == NULL) { - fprintf(fp, " <NULL mem_fetch> \n"); - return; - } + // if (this == NULL) { // doesn't make sense! 
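// [Editor's note] Context for the disabled check above: calling a member
// function through a null pointer is already undefined behavior in C++, so a
// `this == NULL` test inside the member is dead code that compilers may
// legally optimize away. If a guard is wanted, it belongs at the call site.
// A minimal sketch of such a caller-side guard (print_mf is a hypothetical
// helper, not part of this patch; assumes <cstdio>):
//
//   void print_mf(const mem_fetch *mf, FILE *fp) {
//     if (mf == nullptr) {              // test the pointer, not `this`
//       fprintf(fp, " <NULL mem_fetch> \n");
//       return;
//     }
//     mf->print(fp);                    // print() may now assume a live object
//   }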
+ // fprintf(fp, " <NULL mem_fetch> \n"); + // return; + // } fprintf(fp, " mf: uid=%6u, sid%02u:w%02u, part=%u, ", m_request_uid, m_sid, m_wid, m_raw_addr.chip); m_access.print(fp); diff --git a/src/gpgpu-sim/mem_fetch.h b/src/gpgpu-sim/mem_fetch.h index e039846e3..770421822 100644 --- a/src/gpgpu-sim/mem_fetch.h +++ b/src/gpgpu-sim/mem_fetch.h @@ -54,9 +54,10 @@ class memory_config; class mem_fetch { public: mem_fetch(const mem_access_t &access, const warp_inst_t *inst, - unsigned ctrl_size, unsigned wid, unsigned sid, unsigned tpc, - const memory_config *config, unsigned long long cycle, - mem_fetch *original_mf = NULL, mem_fetch *original_wr_mf = NULL); + unsigned long long streamID, unsigned ctrl_size, unsigned wid, + unsigned sid, unsigned tpc, const memory_config *config, + unsigned long long cycle, mem_fetch *original_mf = NULL, + mem_fetch *original_wr_mf = NULL); ~mem_fetch(); void set_status(enum mem_fetch_status status, unsigned long long cycle); @@ -77,7 +78,7 @@ class mem_fetch { const addrdec_t &get_tlx_addr() const { return m_raw_addr; } void set_chip(unsigned chip_id) { m_raw_addr.chip = chip_id; } - void set_parition(unsigned sub_partition_id) { + void set_partition(unsigned sub_partition_id) { m_raw_addr.sub_partition = sub_partition_id; } unsigned get_data_size() const { return m_data_size; } @@ -105,6 +106,7 @@ class mem_fetch { unsigned get_timestamp() const { return m_timestamp; } unsigned get_return_timestamp() const { return m_timestamp2; } unsigned get_icnt_receive_time() const { return m_icnt_receive_time; } + unsigned long long get_streamID() const { return m_streamID; } enum mem_access_type get_access_type() const { return m_access.get_type(); } const active_mask_t &get_access_warp_mask() const { @@ -163,6 +165,8 @@ class mem_fetch { // requesting instruction (put last so mem_fetch prints nicer in gdb) warp_inst_t m_inst; + unsigned long long m_streamID; + static unsigned sm_next_mf_request_uid; const memory_config *m_mem_config; diff --git a/src/gpgpu-sim/mem_latency_stat.cc b/src/gpgpu-sim/mem_latency_stat.cc index 63d7ee80c..c77a68648 100644 --- a/src/gpgpu-sim/mem_latency_stat.cc +++ b/src/gpgpu-sim/mem_latency_stat.cc @@ -203,7 +203,15 @@ unsigned memory_stats_t::memlatstat_done(mem_fetch *mf) { } void memory_stats_t::memlatstat_read_done(mem_fetch *mf) { - if (m_memory_config->gpgpu_memlatency_stat) { + if (m_memory_config->SST_mode) { + // in SST mode, we just calculate mem latency + unsigned mf_latency; + mf_latency = + (m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle) - mf->get_timestamp(); + num_mfs++; + mf_total_lat += mf_latency; + if (mf_latency > max_mf_latency) max_mf_latency = mf_latency; + } else if (m_memory_config->gpgpu_memlatency_stat) { unsigned mf_latency = memlatstat_done(mf); if (mf_latency > mf_max_lat_table[mf->get_tlx_addr().chip][mf->get_tlx_addr().bk]) @@ -273,7 +281,12 @@ void memory_stats_t::memlatstat_print(unsigned n_mem, unsigned gpu_mem_n_bk) { unsigned max_bank_accesses, min_bank_accesses, max_chip_accesses, min_chip_accesses; - if (m_memory_config->gpgpu_memlatency_stat) { + if (m_memory_config->SST_mode) { + // in SST mode, we just calculate mem latency + printf("max_mem_SST_latency = %d \n", max_mf_latency); + if (num_mfs) + printf("average_mf_SST_latency = %lld \n", mf_total_lat / num_mfs); + } else if (m_memory_config->gpgpu_memlatency_stat) { printf("maxmflatency = %d \n", max_mf_latency); printf("max_icnt2mem_latency = %d \n", max_icnt2mem_latency); printf("maxmrqlatency = %d \n", max_mrq_latency); diff --git 
a/src/gpgpu-sim/power_interface.cc b/src/gpgpu-sim/power_interface.cc index c637d846f..cddb6e987 100644 --- a/src/gpgpu-sim/power_interface.cc +++ b/src/gpgpu-sim/power_interface.cc @@ -1,18 +1,21 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Ahmed El-Shafiey, Tayler Hetherington -// The University of British Columbia -// All rights reserved. +// Copyright (c) 2009-2021, Tor M. Aamodt, Ahmed El-Shafiey, Tayler +// Hetherington, Vijay Kandiah, Nikos Hardavellas, Mahmoud Khairy, Junrui Pan, +// Timothy G. Rogers The University of British Columbia, Northwestern +// University, Purdue University All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, +// this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -38,7 +41,9 @@ void init_mcpat(const gpgpu_sim_config &config, config.g_power_simulation_enabled, config.g_power_trace_enabled, config.g_steady_power_levels_enabled, config.g_power_per_cycle_dump, config.gpu_steady_power_deviation, config.gpu_steady_min_period, - config.g_power_trace_zlevel, tot_inst + inst, stat_sample_freq); + config.g_power_trace_zlevel, tot_inst + inst, stat_sample_freq, + config.g_power_simulation_mode, config.g_dvfs_enabled, + config.get_core_freq() / 1000000, config.num_shader()); } void mcpat_cycle(const gpgpu_sim_config &config, @@ -46,7 +51,7 @@ void mcpat_cycle(const gpgpu_sim_config &config, class gpgpu_sim_wrapper *wrapper, class power_stat_t *power_stats, unsigned stat_sample_freq, unsigned tot_cycle, unsigned cycle, unsigned tot_inst, - unsigned inst) { + unsigned inst, bool dvfs_enabled) { static bool mcpat_init = true; if (mcpat_init) { // If first cycle, don't have any power numbers yet @@ -55,41 +60,49 @@ void mcpat_cycle(const gpgpu_sim_config &config, } if ((tot_cycle + cycle) % stat_sample_freq == 0) { + if (dvfs_enabled) { + wrapper->set_model_voltage(1); // performance model needs to support + // this. 
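// [Editor's note] A quick sketch of the sampling discipline used throughout
// mcpat_cycle(): nothing is computed per cycle; the McPAT wrapper is fed once
// per stat_sample_freq-cycle window, and the integer argument now passed to
// the power_stats getters below selects which tracked stat window is read
// (mcpat_cycle() passes 0, while calculate_hw_mcpat() later passes 1).
// Stand-alone illustration of the window pattern only; WindowSampler and its
// members are hypothetical names, not AccelWattch API:
//
//   #include <cstdio>
//
//   struct WindowSampler {
//     unsigned long long last_total = 0;  // counter value at last window end
//     void tick(unsigned long long tot_cycle, unsigned long long cycle,
//               unsigned freq, unsigned long long running_total) {
//       if ((tot_cycle + cycle) % freq != 0) return;  // same guard as above
//       unsigned long long window = running_total - last_total;
//       last_total = running_total;
//       std::printf("activity this window: %llu\n", window);
//     }
//   };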
+ } + wrapper->set_inst_power( shdr_config->gpgpu_clock_gated_lanes, stat_sample_freq, - stat_sample_freq, power_stats->get_total_inst(), - power_stats->get_total_int_inst(), power_stats->get_total_fp_inst(), - power_stats->get_l1d_read_accesses(), - power_stats->get_l1d_write_accesses(), - power_stats->get_committed_inst()); + stat_sample_freq, power_stats->get_total_inst(0), + power_stats->get_total_int_inst(0), power_stats->get_total_fp_inst(0), + power_stats->get_l1d_read_accesses(0), + power_stats->get_l1d_write_accesses(0), + power_stats->get_committed_inst(0)); // Single RF for both int and fp ops - wrapper->set_regfile_power(power_stats->get_regfile_reads(), - power_stats->get_regfile_writes(), - power_stats->get_non_regfile_operands()); + wrapper->set_regfile_power(power_stats->get_regfile_reads(0), + power_stats->get_regfile_writes(0), + power_stats->get_non_regfile_operands(0)); // Instruction cache stats - wrapper->set_icache_power(power_stats->get_inst_c_hits(), - power_stats->get_inst_c_misses()); + wrapper->set_icache_power(power_stats->get_inst_c_hits(0), + power_stats->get_inst_c_misses(0)); // Constant Cache, shared memory, texture cache - wrapper->set_ccache_power(power_stats->get_constant_c_hits(), - power_stats->get_constant_c_misses()); + wrapper->set_ccache_power( + power_stats->get_const_accessess(0), + 0); // assuming all HITS in constant cache for now wrapper->set_tcache_power(power_stats->get_texture_c_hits(), power_stats->get_texture_c_misses()); - wrapper->set_shrd_mem_power(power_stats->get_shmem_read_access()); + wrapper->set_shrd_mem_power(power_stats->get_shmem_access(0)); - wrapper->set_l1cache_power( - power_stats->get_l1d_read_hits(), power_stats->get_l1d_read_misses(), - power_stats->get_l1d_write_hits(), power_stats->get_l1d_write_misses()); + wrapper->set_l1cache_power(power_stats->get_l1d_read_hits(0), + power_stats->get_l1d_read_misses(0), + power_stats->get_l1d_write_hits(0), + power_stats->get_l1d_write_misses(0)); wrapper->set_l2cache_power( - power_stats->get_l2_read_hits(), power_stats->get_l2_read_misses(), - power_stats->get_l2_write_hits(), power_stats->get_l2_write_misses()); + power_stats->get_l2_read_hits(0), power_stats->get_l2_read_misses(0), + power_stats->get_l2_write_hits(0), power_stats->get_l2_write_misses(0)); float active_sms = (*power_stats->m_active_sms) / stat_sample_freq; float num_cores = shdr_config->num_shader(); float num_idle_core = num_cores - active_sms; + wrapper->set_num_cores(num_cores); wrapper->set_idle_core_power(num_idle_core); // pipeline power - pipeline_duty_cycle *= percent_active_sms; @@ -101,37 +114,60 @@ void mcpat_cycle(const gpgpu_sim_config &config, wrapper->set_duty_cycle_power(pipeline_duty_cycle); // Memory Controller - wrapper->set_mem_ctrl_power(power_stats->get_dram_rd(), - power_stats->get_dram_wr(), - power_stats->get_dram_pre()); + wrapper->set_mem_ctrl_power(power_stats->get_dram_rd(0), + power_stats->get_dram_wr(0), + power_stats->get_dram_pre(0)); // Execution pipeline accesses // FPU (SP) accesses, Integer ALU (not present in Tesla), Sfu accesses - wrapper->set_exec_unit_power(power_stats->get_tot_fpu_accessess(), - power_stats->get_ialu_accessess(), - power_stats->get_tot_sfu_accessess()); + + wrapper->set_int_accesses(power_stats->get_ialu_accessess(0), + power_stats->get_intmul24_accessess(0), + power_stats->get_intmul32_accessess(0), + power_stats->get_intmul_accessess(0), + power_stats->get_intdiv_accessess(0)); + + wrapper->set_dp_accesses(power_stats->get_dp_accessess(0), + 
power_stats->get_dpmul_accessess(0), + power_stats->get_dpdiv_accessess(0)); + + wrapper->set_fp_accesses(power_stats->get_fp_accessess(0), + power_stats->get_fpmul_accessess(0), + power_stats->get_fpdiv_accessess(0)); + + wrapper->set_trans_accesses( + power_stats->get_sqrt_accessess(0), power_stats->get_log_accessess(0), + power_stats->get_sin_accessess(0), power_stats->get_exp_accessess(0)); + + wrapper->set_tensor_accesses(power_stats->get_tensor_accessess(0)); + + wrapper->set_tex_accesses(power_stats->get_tex_accessess(0)); + + wrapper->set_exec_unit_power(power_stats->get_tot_fpu_accessess(0), + power_stats->get_ialu_accessess(0), + power_stats->get_tot_sfu_accessess(0)); + + wrapper->set_avg_active_threads(power_stats->get_active_threads(0)); // Average active lanes for sp and sfu pipelines float avg_sp_active_lanes = (power_stats->get_sp_active_lanes()) / stat_sample_freq; float avg_sfu_active_lanes = (power_stats->get_sfu_active_lanes()) / stat_sample_freq; + if (avg_sp_active_lanes > 32.0) avg_sp_active_lanes = 32.0; + if (avg_sfu_active_lanes > 32.0) avg_sfu_active_lanes = 32.0; assert(avg_sp_active_lanes <= 32); assert(avg_sfu_active_lanes <= 32); - wrapper->set_active_lanes_power( - (power_stats->get_sp_active_lanes()) / stat_sample_freq, - (power_stats->get_sfu_active_lanes()) / stat_sample_freq); - - double n_icnt_simt_to_mem = - (double) - power_stats->get_icnt_simt_to_mem(); // # flits from SIMT clusters - // to memory partitions - double n_icnt_mem_to_simt = - (double) - power_stats->get_icnt_mem_to_simt(); // # flits from memory - // partitions to SIMT clusters + wrapper->set_active_lanes_power(avg_sp_active_lanes, avg_sfu_active_lanes); + + double n_icnt_simt_to_mem = (double)power_stats->get_icnt_simt_to_mem( + 0); // # flits from SIMT clusters + // to memory partitions + double n_icnt_mem_to_simt = (double)power_stats->get_icnt_mem_to_simt( + 0); // # flits from memory + // partitions to SIMT clusters wrapper->set_NoC_power( - n_icnt_mem_to_simt, + n_icnt_mem_to_simt + + n_icnt_simt_to_mem); // Number of flits traversing the interconnect wrapper->compute(); @@ -152,3 +188,381 @@ void mcpat_reset_perf_count(class gpgpu_sim_wrapper *wrapper) { wrapper->reset_counters(); } + +bool parse_hw_file(char *hwpowerfile, bool find_target_kernel, + vector<string> &hw_data, char *benchname, + std::string executed_kernelname) { + fstream hw_file; + hw_file.open(hwpowerfile, ios::in); + string line, word, temp; + while (!hw_file.eof()) { + hw_data.clear(); + getline(hw_file, line); + stringstream s(line); + while (getline(s, word, ',')) { + hw_data.push_back(word); + } + if (hw_data[HW_BENCH_NAME] == std::string(benchname)) { + if (find_target_kernel) { + if (hw_data[HW_KERNEL_NAME] == "") { + hw_file.close(); + return true; + } else { + if (hw_data[HW_KERNEL_NAME] == executed_kernelname) { + hw_file.close(); + return true; + } + } + } else { + hw_file.close(); + return true; + } + } + } + hw_file.close(); + return false; +} + +void calculate_hw_mcpat( + const gpgpu_sim_config &config, const shader_core_config *shdr_config, + class gpgpu_sim_wrapper *wrapper, class power_stat_t *power_stats, + unsigned stat_sample_freq, unsigned tot_cycle, unsigned cycle, + unsigned tot_inst, unsigned inst, int power_simulation_mode, + bool dvfs_enabled, char *hwpowerfile, char *benchname, + std::string executed_kernelname, + const bool *accelwattch_hybrid_configuration, bool aggregate_power_stats) { + /* Reading HW data from CSV file */ + + vector<string> hw_data; + 
bool kernel_found = false; + kernel_found = parse_hw_file( + hwpowerfile, true, hw_data, benchname, + executed_kernelname); // Searching for matching executed_kernelname. + if (!kernel_found) + kernel_found = parse_hw_file( + hwpowerfile, false, hw_data, benchname, + executed_kernelname); // Searching for any kernel with same benchname. + assert( + "Could not find perf stats for the target benchmark in hwpowerfile.\n" && + (kernel_found)); + unsigned perf_cycles = + static_cast<unsigned>(std::stod(hw_data[HW_CYCLES]) + 0.5); + if ((power_simulation_mode == 2) && + (accelwattch_hybrid_configuration[HW_CYCLES])) + perf_cycles = cycle; + wrapper->init_mcpat_hw_mode( + perf_cycles); // total PERF MODEL cycles for current kernel + + if (dvfs_enabled) { + if ((power_simulation_mode == 2) && + (accelwattch_hybrid_configuration[HW_VOLTAGE])) + wrapper->set_model_voltage(1); // performance model needs to support this + else + wrapper->set_model_voltage(std::stod( + hw_data[HW_VOLTAGE])); // performance model needs to support this + } + + double l1_read_hits = std::stod(hw_data[HW_L1_RH]); + double l1_read_misses = std::stod(hw_data[HW_L1_RM]); + double l1_write_hits = std::stod(hw_data[HW_L1_WH]); + double l1_write_misses = std::stod(hw_data[HW_L1_WM]); + + if ((power_simulation_mode == 2) && + (accelwattch_hybrid_configuration[HW_L1_RH])) + l1_read_hits = + power_stats->get_l1d_read_hits(1) - power_stats->l1r_hits_kernel; + if ((power_simulation_mode == 2) && + (accelwattch_hybrid_configuration[HW_L1_RM])) + l1_read_misses = + power_stats->get_l1d_read_misses(1) - power_stats->l1r_misses_kernel; + if ((power_simulation_mode == 2) && + (accelwattch_hybrid_configuration[HW_L1_WH])) + l1_write_hits = + power_stats->get_l1d_write_hits(1) - power_stats->l1w_hits_kernel; + if ((power_simulation_mode == 2) && + (accelwattch_hybrid_configuration[HW_L1_WM])) + l1_write_misses = + power_stats->get_l1d_write_misses(1) - power_stats->l1w_misses_kernel; + + if (aggregate_power_stats) { + power_stats->tot_inst_execution += power_stats->get_total_inst(1); + power_stats->tot_int_inst_execution += power_stats->get_total_int_inst(1); + power_stats->tot_fp_inst_execution += power_stats->get_total_fp_inst(1); + power_stats->commited_inst_execution += power_stats->get_committed_inst(1); + wrapper->set_inst_power( + shdr_config->gpgpu_clock_gated_lanes, + cycle, // TODO: core.[0] cycles counts don't matter, remove this + cycle, power_stats->tot_inst_execution, + power_stats->tot_int_inst_execution, power_stats->tot_fp_inst_execution, + l1_read_hits + l1_read_misses, l1_write_hits + l1_write_misses, + power_stats->commited_inst_execution); + } else { + wrapper->set_inst_power( + shdr_config->gpgpu_clock_gated_lanes, + cycle, // TODO: core.[0] cycles counts don't matter, remove this + cycle, power_stats->get_total_inst(1), + power_stats->get_total_int_inst(1), power_stats->get_total_fp_inst(1), + l1_read_hits + l1_read_misses, l1_write_hits + l1_write_misses, + power_stats->get_committed_inst(1)); + } + + // Single RF for both int and fp ops -- activity factor set to 0 for + // Accelwattch HW and Accelwattch Hybrid because no HW Perf Stats for register + // files + wrapper->set_regfile_power(power_stats->get_regfile_reads(1), + power_stats->get_regfile_writes(1), + power_stats->get_non_regfile_operands(1)); + + // Instruction cache stats -- activity factor set to 0 for Accelwattch HW and + // Accelwattch Hybrid because no HW Perf Stats for instruction cache + wrapper->set_icache_power( + power_stats->get_inst_c_hits(1) - 
power_stats->l1i_hits_kernel, + power_stats->get_inst_c_misses(1) - power_stats->l1i_misses_kernel); + + // Constant Cache, shared memory, texture cache + if ((power_simulation_mode == 2) && + (accelwattch_hybrid_configuration[HW_CC_ACC])) + wrapper->set_ccache_power( + power_stats->get_const_accessess(1) - power_stats->cc_accesses_kernel, + 0); // assuming all HITS in constant cache for now + else + wrapper->set_ccache_power( + std::stod(hw_data[HW_CC_ACC]), + 0); // assuming all HITS in constant cache for now + + // wrapper->set_tcache_power(power_stats->get_texture_c_hits(), + // power_stats->get_texture_c_misses()); + + if ((power_simulation_mode == 2) && + (accelwattch_hybrid_configuration[HW_SHRD_ACC])) + wrapper->set_shrd_mem_power(power_stats->get_shmem_access(1) - + power_stats->shared_accesses_kernel); + else + wrapper->set_shrd_mem_power(std::stod(hw_data[HW_SHRD_ACC])); + + wrapper->set_l1cache_power(l1_read_hits, l1_read_misses, l1_write_hits, + l1_write_misses); + + double l2_read_hits = std::stod(hw_data[HW_L2_RH]); + double l2_read_misses = std::stod(hw_data[HW_L2_RM]); + double l2_write_hits = std::stod(hw_data[HW_L2_WH]); + double l2_write_misses = std::stod(hw_data[HW_L2_WM]); + + if ((power_simulation_mode == 2) && + (accelwattch_hybrid_configuration[HW_L2_RH])) + l2_read_hits = + power_stats->get_l2_read_hits(1) - power_stats->l2r_hits_kernel; + if ((power_simulation_mode == 2) && + (accelwattch_hybrid_configuration[HW_L2_RM])) + l2_read_misses = + power_stats->get_l2_read_misses(1) - power_stats->l2r_misses_kernel; + if ((power_simulation_mode == 2) && + (accelwattch_hybrid_configuration[HW_L2_WH])) + l2_write_hits = + power_stats->get_l2_write_hits(1) - power_stats->l2w_hits_kernel; + if ((power_simulation_mode == 2) && + (accelwattch_hybrid_configuration[HW_L2_WM])) + l2_write_misses = + power_stats->get_l2_write_misses(1) - power_stats->l2w_misses_kernel; + + wrapper->set_l2cache_power(l2_read_hits, l2_read_misses, l2_write_hits, + l2_write_misses); + + float active_sms = (*power_stats->m_active_sms) / stat_sample_freq; + float num_cores = shdr_config->num_shader(); + float num_idle_core = num_cores - active_sms; + wrapper->set_num_cores(num_cores); + if ((power_simulation_mode == 2) && + (accelwattch_hybrid_configuration[HW_NUM_SM_IDLE])) + wrapper->set_idle_core_power(num_idle_core); + else + wrapper->set_idle_core_power(std::stod(hw_data[HW_NUM_SM_IDLE])); + + float pipeline_duty_cycle = + ((*power_stats->m_average_pipeline_duty_cycle / (stat_sample_freq)) < 0.8) + ? 
((*power_stats->m_average_pipeline_duty_cycle) / stat_sample_freq) + : 0.8; + + if ((power_simulation_mode == 2) && + (accelwattch_hybrid_configuration[HW_PIPE_DUTY])) + wrapper->set_duty_cycle_power(pipeline_duty_cycle); + else + wrapper->set_duty_cycle_power(std::stod(hw_data[HW_PIPE_DUTY])); + + // Memory Controller + + double dram_reads = std::stod(hw_data[HW_DRAM_RD]); + double dram_writes = std::stod(hw_data[HW_DRAM_WR]); + double dram_pre = 0; + if ((power_simulation_mode == 2) && + (accelwattch_hybrid_configuration[HW_DRAM_RD])) + dram_reads = power_stats->get_dram_rd(1) - power_stats->dram_rd_kernel; + if ((power_simulation_mode == 2) && + (accelwattch_hybrid_configuration[HW_DRAM_WR])) + dram_writes = power_stats->get_dram_wr(1) - power_stats->dram_wr_kernel; + if ((power_simulation_mode == 2) && + (accelwattch_hybrid_configuration[HW_DRAM_RD])) + dram_pre = power_stats->get_dram_pre(1) - power_stats->dram_pre_kernel; + + wrapper->set_mem_ctrl_power(dram_reads, dram_writes, dram_pre); + + if (aggregate_power_stats) { + power_stats->ialu_acc_execution += power_stats->get_ialu_accessess(1); + power_stats->imul24_acc_execution += power_stats->get_intmul24_accessess(1); + power_stats->imul32_acc_execution += power_stats->get_intmul32_accessess(1); + power_stats->imul_acc_execution += power_stats->get_intmul_accessess(1); + power_stats->idiv_acc_execution += power_stats->get_intdiv_accessess(1); + power_stats->dp_acc_execution += power_stats->get_dp_accessess(1); + power_stats->dpmul_acc_execution += power_stats->get_dpmul_accessess(1); + power_stats->dpdiv_acc_execution += power_stats->get_dpdiv_accessess(1); + power_stats->fp_acc_execution += power_stats->get_fp_accessess(1); + power_stats->fpmul_acc_execution += power_stats->get_fpmul_accessess(1); + power_stats->fpdiv_acc_execution += power_stats->get_fpdiv_accessess(1); + power_stats->sqrt_acc_execution += power_stats->get_sqrt_accessess(1); + power_stats->log_acc_execution += power_stats->get_log_accessess(1); + power_stats->sin_acc_execution += power_stats->get_sin_accessess(1); + power_stats->exp_acc_execution += power_stats->get_exp_accessess(1); + power_stats->tensor_acc_execution += power_stats->get_tensor_accessess(1); + power_stats->tex_acc_execution += power_stats->get_tex_accessess(1); + power_stats->tot_fpu_acc_execution += power_stats->get_tot_fpu_accessess(1); + power_stats->tot_sfu_acc_execution += power_stats->get_tot_sfu_accessess(1); + power_stats->tot_threads_acc_execution += + power_stats->get_tot_threads_kernel(1); + power_stats->tot_warps_acc_execution += + power_stats->get_tot_warps_kernel(1); + + power_stats->sp_active_lanes_execution += + (power_stats->get_sp_active_lanes() * shdr_config->num_shader() * + shdr_config->gpgpu_num_sp_units); + power_stats->sfu_active_lanes_execution += + (power_stats->get_sfu_active_lanes() * shdr_config->num_shader() * + shdr_config->gpgpu_num_sp_units); + + wrapper->set_int_accesses( + power_stats->ialu_acc_execution, power_stats->imul24_acc_execution, + power_stats->imul32_acc_execution, power_stats->imul_acc_execution, + power_stats->idiv_acc_execution); + + wrapper->set_dp_accesses(power_stats->dp_acc_execution, + power_stats->dpmul_acc_execution, + power_stats->dpdiv_acc_execution); + + wrapper->set_fp_accesses(power_stats->fp_acc_execution, + power_stats->fpmul_acc_execution, + power_stats->fpdiv_acc_execution); + + wrapper->set_trans_accesses( + power_stats->sqrt_acc_execution, power_stats->log_acc_execution, + power_stats->sin_acc_execution, 
power_stats->exp_acc_execution); + + wrapper->set_tensor_accesses(power_stats->tensor_acc_execution); + + wrapper->set_tex_accesses(power_stats->tex_acc_execution); + + wrapper->set_exec_unit_power(power_stats->ialu_acc_execution, + power_stats->tot_fpu_acc_execution, + power_stats->tot_sfu_acc_execution); + + wrapper->set_avg_active_threads( + (double)((double)power_stats->tot_threads_acc_execution / + (double)power_stats->tot_warps_acc_execution)); + + // Average active lanes for sp and sfu pipelines + float avg_sp_active_lanes = + (power_stats->sp_active_lanes_execution) / shdr_config->num_shader() / + shdr_config->gpgpu_num_sp_units / stat_sample_freq; + float avg_sfu_active_lanes = + (power_stats->sfu_active_lanes_execution) / shdr_config->num_shader() / + shdr_config->gpgpu_num_sp_units / stat_sample_freq; + if (avg_sp_active_lanes > 32.0) avg_sp_active_lanes = 32.0; + if (avg_sfu_active_lanes > 32.0) avg_sfu_active_lanes = 32.0; + assert(avg_sp_active_lanes <= 32); + assert(avg_sfu_active_lanes <= 32); + wrapper->set_active_lanes_power(avg_sp_active_lanes, avg_sfu_active_lanes); + } else { + wrapper->set_int_accesses(power_stats->get_ialu_accessess(1), + power_stats->get_intmul24_accessess(1), + power_stats->get_intmul32_accessess(1), + power_stats->get_intmul_accessess(1), + power_stats->get_intdiv_accessess(1)); + + wrapper->set_dp_accesses(power_stats->get_dp_accessess(1), + power_stats->get_dpmul_accessess(1), + power_stats->get_dpdiv_accessess(1)); + + wrapper->set_fp_accesses(power_stats->get_fp_accessess(1), + power_stats->get_fpmul_accessess(1), + power_stats->get_fpdiv_accessess(1)); + + wrapper->set_trans_accesses( + power_stats->get_sqrt_accessess(1), power_stats->get_log_accessess(1), + power_stats->get_sin_accessess(1), power_stats->get_exp_accessess(1)); + + wrapper->set_tensor_accesses(power_stats->get_tensor_accessess(1)); + + wrapper->set_tex_accesses(power_stats->get_tex_accessess(1)); + + wrapper->set_exec_unit_power(power_stats->get_tot_fpu_accessess(1), + power_stats->get_ialu_accessess(1), + power_stats->get_tot_sfu_accessess(1)); + + wrapper->set_avg_active_threads(power_stats->get_active_threads(1)); + + // Average active lanes for sp and sfu pipelines + float avg_sp_active_lanes = + (power_stats->get_sp_active_lanes()) / stat_sample_freq; + float avg_sfu_active_lanes = + (power_stats->get_sfu_active_lanes()) / stat_sample_freq; + if (avg_sp_active_lanes > 32.0) avg_sp_active_lanes = 32.0; + if (avg_sfu_active_lanes > 32.0) avg_sfu_active_lanes = 32.0; + assert(avg_sp_active_lanes <= 32); + assert(avg_sfu_active_lanes <= 32); + wrapper->set_active_lanes_power(avg_sp_active_lanes, avg_sfu_active_lanes); + } + + double n_icnt_simt_to_mem = + (double)(power_stats->get_icnt_simt_to_mem(1) - + power_stats->noc_tr_kernel); // # flits from SIMT clusters + // to memory partitions + double n_icnt_mem_to_simt = + (double)(power_stats->get_icnt_mem_to_simt(1) - + power_stats->noc_rc_kernel); // # flits from memory + // partitions to SIMT clusters + if ((power_simulation_mode == 2) && + (accelwattch_hybrid_configuration[HW_NOC])) + wrapper->set_NoC_power( + n_icnt_mem_to_simt + + n_icnt_simt_to_mem); // Number of flits traversing the interconnect + // from Accel-Sim + else + wrapper->set_NoC_power( + std::stod(hw_data[HW_NOC])); // Number of flits traversing the + // interconnect from HW + + wrapper->compute(); + + wrapper->update_components_power(); + + wrapper->power_metrics_calculations(); + + wrapper->dump(); + power_stats->l1r_hits_kernel = 
power_stats->get_l1d_read_hits(1); + power_stats->l1r_misses_kernel = power_stats->get_l1d_read_misses(1); + power_stats->l1w_hits_kernel = power_stats->get_l1d_write_hits(1); + power_stats->l1w_misses_kernel = power_stats->get_l1d_write_misses(1); + power_stats->shared_accesses_kernel = power_stats->get_const_accessess(1); + power_stats->cc_accesses_kernel = power_stats->get_shmem_access(1); + power_stats->dram_rd_kernel = power_stats->get_dram_rd(1); + power_stats->dram_wr_kernel = power_stats->get_dram_wr(1); + power_stats->dram_pre_kernel = power_stats->get_dram_pre(1); + power_stats->l1i_hits_kernel = power_stats->get_inst_c_hits(1); + power_stats->l1i_misses_kernel = power_stats->get_inst_c_misses(1); + power_stats->l2r_hits_kernel = power_stats->get_l2_read_hits(1); + power_stats->l2r_misses_kernel = power_stats->get_l2_read_misses(1); + power_stats->l2w_hits_kernel = power_stats->get_l2_write_hits(1); + power_stats->l2w_misses_kernel = power_stats->get_l2_write_misses(1); + power_stats->noc_tr_kernel = power_stats->get_icnt_simt_to_mem(1); + power_stats->noc_rc_kernel = power_stats->get_icnt_mem_to_simt(1); + + power_stats->clear(); +} \ No newline at end of file diff --git a/src/gpgpu-sim/power_interface.h b/src/gpgpu-sim/power_interface.h index 2bfd4d504..3c043e6b6 100644 --- a/src/gpgpu-sim/power_interface.h +++ b/src/gpgpu-sim/power_interface.h @@ -1,18 +1,21 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Ahmed El-Shafiey, Tayler Hetherington -// The University of British Columbia -// All rights reserved. +// Copyright (c) 2009-2021, Tor M. Aamodt, Ahmed El-Shafiey, Tayler +// Hetherington, Vijay Kandiah, Nikos Hardavellas, Mahmoud Khairy, Junrui Pan, +// Timothy G. Rogers The University of British Columbia, Northwestern +// University, Purdue University All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, +// this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. 
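// [Editor's note] The assignments that close calculate_hw_mcpat() above
// (power_stats->l1r_hits_kernel = get_l1d_read_hits(1); ... noc_rc_kernel =
// get_icnt_mem_to_simt(1);) complete a snapshot/delta idiom: the simulator
// counters are cumulative over the whole run, so the hybrid paths compute
// per-kernel activity as `get_X(1) - X_kernel`, and the X_kernel snapshots
// are refreshed right before power_stats->clear(). A self-contained sketch of
// the same idiom (CumulativeCounter is a hypothetical stand-in, not a type
// from this patch):
//
//   struct CumulativeCounter {
//     unsigned long long total = 0;     // grows monotonically across kernels
//     unsigned long long snapshot = 0;  // value captured at last kernel end
//     unsigned long long kernel_delta() const { return total - snapshot; }
//     void end_kernel() { snapshot = total; }  // call after dumping stats
//   };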
// // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -43,7 +46,21 @@ void mcpat_cycle(const gpgpu_sim_config &config, class gpgpu_sim_wrapper *wrapper, class power_stat_t *power_stats, unsigned stat_sample_freq, unsigned tot_cycle, unsigned cycle, unsigned tot_inst, - unsigned inst); + unsigned inst, bool dvfs_enabled); + +void calculate_hw_mcpat( + const gpgpu_sim_config &config, const shader_core_config *shdr_config, + class gpgpu_sim_wrapper *wrapper, class power_stat_t *power_stats, + unsigned stat_sample_freq, unsigned tot_cycle, unsigned cycle, + unsigned tot_inst, unsigned inst, int power_simulation_mode, + bool dvfs_enabled, char *hwpowerfile, char *benchname, + std::string executed_kernelname, + const bool *accelwattch_hybrid_configuration, bool aggregate_power_stats); + +bool parse_hw_file(char *hwpowerfile, bool find_target_kernel, + vector<string> &hw_data, char *benchname, + std::string executed_kernelname); + void mcpat_reset_perf_count(class gpgpu_sim_wrapper *wrapper); #endif /* POWER_INTERFACE_H_ */ diff --git a/src/gpgpu-sim/power_stat.cc b/src/gpgpu-sim/power_stat.cc index 7b60ddf84..764652b9e 100644 --- a/src/gpgpu-sim/power_stat.cc +++ b/src/gpgpu-sim/power_stat.cc @@ -1,18 +1,21 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Ahmed El-Shafiey, Tayler Hetherington -// The University of British Columbia -// All rights reserved. +// Copyright (c) 2009-2021, Tor M. Aamodt, Ahmed El-Shafiey, Tayler +// Hetherington, Vijay Kandiah, Nikos Hardavellas, Mahmoud Khairy, Junrui Pan, +// Timothy G. Rogers The University of British Columbia, Northwestern +// University, Purdue University All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, +// this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. 
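// [Editor's note] parse_hw_file(), declared just above, treats hwpowerfile as
// a plain CSV: each line is split on ',', the HW_BENCH_NAME column is matched
// against benchname, and HW_KERNEL_NAME is optionally matched against the
// executed kernel. A stripped-down, compilable sketch of the same row-matching
// loop (find_row and the column-0 benchmark convention are illustrative
// assumptions, not AccelWattch API):
//
//   #include <fstream>
//   #include <sstream>
//   #include <string>
//   #include <vector>
//
//   bool find_row(const char *path, const std::string &bench,
//                 std::vector<std::string> &row) {
//     std::ifstream f(path);
//     std::string line;
//     while (std::getline(f, line)) {   // one CSV record per line
//       row.clear();
//       std::stringstream s(line);
//       std::string cell;
//       while (std::getline(s, cell, ',')) row.push_back(cell);
//       if (!row.empty() && row[0] == bench) return true;
//     }
//     return false;  // no row for this benchmark
//   }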
// // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -54,10 +57,62 @@ power_mem_stat_t::power_mem_stat_t(const memory_config *mem_config, init(); } +void power_stat_t::clear() { + for (unsigned i = 0; i < NUM_STAT_IDX; ++i) { + pwr_mem_stat->core_cache_stats[i].clear(); + pwr_mem_stat->l2_cache_stats[i].clear(); + for (unsigned j = 0; j < m_config->num_shader(); ++j) { + pwr_core_stat->m_pipeline_duty_cycle[i][j] = 0; + pwr_core_stat->m_num_decoded_insn[i][j] = 0; + pwr_core_stat->m_num_FPdecoded_insn[i][j] = 0; + pwr_core_stat->m_num_INTdecoded_insn[i][j] = 0; + pwr_core_stat->m_num_storequeued_insn[i][j] = 0; + pwr_core_stat->m_num_loadqueued_insn[i][j] = 0; + pwr_core_stat->m_num_tex_inst[i][j] = 0; + pwr_core_stat->m_num_ialu_acesses[i][j] = 0; + pwr_core_stat->m_num_fp_acesses[i][j] = 0; + pwr_core_stat->m_num_imul_acesses[i][j] = 0; + pwr_core_stat->m_num_imul24_acesses[i][j] = 0; + pwr_core_stat->m_num_imul32_acesses[i][j] = 0; + pwr_core_stat->m_num_fpmul_acesses[i][j] = 0; + pwr_core_stat->m_num_idiv_acesses[i][j] = 0; + pwr_core_stat->m_num_fpdiv_acesses[i][j] = 0; + pwr_core_stat->m_num_dp_acesses[i][j] = 0; + pwr_core_stat->m_num_dpmul_acesses[i][j] = 0; + pwr_core_stat->m_num_dpdiv_acesses[i][j] = 0; + pwr_core_stat->m_num_tensor_core_acesses[i][j] = 0; + pwr_core_stat->m_num_const_acesses[i][j] = 0; + pwr_core_stat->m_num_tex_acesses[i][j] = 0; + pwr_core_stat->m_num_sp_acesses[i][j] = 0; + pwr_core_stat->m_num_sfu_acesses[i][j] = 0; + pwr_core_stat->m_num_sqrt_acesses[i][j] = 0; + pwr_core_stat->m_num_log_acesses[i][j] = 0; + pwr_core_stat->m_num_sin_acesses[i][j] = 0; + pwr_core_stat->m_num_exp_acesses[i][j] = 0; + pwr_core_stat->m_num_mem_acesses[i][j] = 0; + pwr_core_stat->m_num_sp_committed[i][j] = 0; + pwr_core_stat->m_num_sfu_committed[i][j] = 0; + pwr_core_stat->m_num_mem_committed[i][j] = 0; + pwr_core_stat->m_read_regfile_acesses[i][j] = 0; + pwr_core_stat->m_write_regfile_acesses[i][j] = 0; + pwr_core_stat->m_non_rf_operands[i][j] = 0; + pwr_core_stat->m_active_sp_lanes[i][j] = 0; + pwr_core_stat->m_active_sfu_lanes[i][j] = 0; + pwr_core_stat->m_active_exu_threads[i][j] = 0; + pwr_core_stat->m_active_exu_warps[i][j] = 0; + } + for (unsigned j = 0; j < m_mem_config->m_n_mem; ++j) { + pwr_mem_stat->n_rd[i][j] = 0; + pwr_mem_stat->n_wr[i][j] = 0; + pwr_mem_stat->n_pre[i][j] = 0; + } + } +} + void power_mem_stat_t::init() { - shmem_read_access[CURRENT_STAT_IDX] = + shmem_access[CURRENT_STAT_IDX] = m_core_stats->gpgpu_n_shmem_bank_access; // Shared memory access - shmem_read_access[PREV_STAT_IDX] = + shmem_access[PREV_STAT_IDX] = (unsigned *)calloc(m_core_config->num_shader(), sizeof(unsigned)); for (unsigned i = 0; i < NUM_STAT_IDX; ++i) { @@ -71,6 +126,7 @@ void power_mem_stat_t::init() { n_pre[i] = (unsigned *)calloc(m_config->m_n_mem, sizeof(unsigned)); n_rd[i] = (unsigned *)calloc(m_config->m_n_mem, sizeof(unsigned)); n_wr[i] = (unsigned *)calloc(m_config->m_n_mem, sizeof(unsigned)); + n_wr_WB[i] = (unsigned *)calloc(m_config->m_n_mem, sizeof(unsigned)); n_req[i] = (unsigned *)calloc(m_config->m_n_mem, sizeof(unsigned)); // Interconnect stats @@ -86,8 +142,8 @@ void power_mem_stat_t::save_stats() { l2_cache_stats[PREV_STAT_IDX] = l2_cache_stats[CURRENT_STAT_IDX]; for (unsigned i = 0; i < m_core_config->num_shader(); ++i) { - shmem_read_access[PREV_STAT_IDX][i] = - shmem_read_access[CURRENT_STAT_IDX][i]; // Shared memory access + 
shmem_access[PREV_STAT_IDX][i] = + shmem_access[CURRENT_STAT_IDX][i]; // Shared memory access } for (unsigned i = 0; i < m_config->m_n_mem; ++i) { @@ -98,6 +154,7 @@ void power_mem_stat_t::save_stats() { n_pre[PREV_STAT_IDX][i] = n_pre[CURRENT_STAT_IDX][i]; n_rd[PREV_STAT_IDX][i] = n_rd[CURRENT_STAT_IDX][i]; n_wr[PREV_STAT_IDX][i] = n_wr[CURRENT_STAT_IDX][i]; + n_wr_WB[PREV_STAT_IDX][i] = n_wr_WB[CURRENT_STAT_IDX][i]; n_req[PREV_STAT_IDX][i] = n_req[CURRENT_STAT_IDX][i]; } @@ -117,17 +174,18 @@ void power_mem_stat_t::print(FILE *fout) const { unsigned total_mem_writes = 0; for (unsigned i = 0; i < m_config->m_n_mem; ++i) { total_mem_reads += n_rd[CURRENT_STAT_IDX][i]; - total_mem_writes += n_wr[CURRENT_STAT_IDX][i]; + total_mem_writes += + n_wr[CURRENT_STAT_IDX][i] + n_wr_WB[CURRENT_STAT_IDX][i]; } fprintf(fout, "Total memory controller accesses: %u\n", total_mem_reads + total_mem_writes); fprintf(fout, "Total memory controller reads: %u\n", total_mem_reads); fprintf(fout, "Total memory controller writes: %u\n", total_mem_writes); - + // TODO: print_stats(require stream ID input) fprintf(fout, "Core cache stats:\n"); - core_cache_stats->print_stats(fout); + core_cache_stats->print_stats(fout, -1); fprintf(fout, "L2 cache stats:\n"); - l2_cache_stats->print_stats(fout); + l2_cache_stats->print_stats(fout, -1); } power_core_stat_t::power_core_stat_t(const shader_core_config *shader_config, @@ -160,29 +218,47 @@ void power_core_stat_t::print(FILE *fout) { m_num_loadqueued_insn[CURRENT_STAT_IDX][i]); fprintf(fout, "\tTotal STORE Queued Instructions=%u\n", m_num_storequeued_insn[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal IALU Acesses=%u\n", + fprintf(fout, "\tTotal IALU Acesses=%f\n", m_num_ialu_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal FP Acesses=%u\n", + fprintf(fout, "\tTotal FP Acesses=%f\n", m_num_fp_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal IMUL Acesses=%u\n", + fprintf(fout, "\tTotal DP Acesses=%f\n", + m_num_dp_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout, "\tTotal IMUL Acesses=%f\n", m_num_imul_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal IMUL24 Acesses=%u\n", + fprintf(fout, "\tTotal IMUL24 Acesses=%f\n", m_num_imul24_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal IMUL32 Acesses=%u\n", + fprintf(fout, "\tTotal IMUL32 Acesses=%f\n", m_num_imul32_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal IDIV Acesses=%u\n", + fprintf(fout, "\tTotal IDIV Acesses=%f\n", m_num_idiv_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal FPMUL Acesses=%u\n", + fprintf(fout, "\tTotal FPMUL Acesses=%f\n", m_num_fpmul_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal SFU Acesses=%u\n", - m_num_trans_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal FPDIV Acesses=%u\n", + fprintf(fout, "\tTotal DPMUL Acesses=%f\n", + m_num_dpmul_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout, "\tTotal SQRT Acesses=%f\n", + m_num_sqrt_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout, "\tTotal LOG Acesses=%f\n", + m_num_log_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout, "\tTotal SIN Acesses=%f\n", + m_num_sin_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout, "\tTotal EXP Acesses=%f\n", + m_num_exp_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout, "\tTotal FPDIV Acesses=%f\n", m_num_fpdiv_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal SFU Acesses=%u\n", + fprintf(fout, "\tTotal DPDIV Acesses=%f\n", + m_num_dpdiv_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout, "\tTotal TENSOR Acesses=%f\n", + m_num_tensor_core_acesses[CURRENT_STAT_IDX][i]); + 
fprintf(fout, "\tTotal CONST Acesses=%f\n", + m_num_const_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout, "\tTotal TEX Acesses=%f\n", + m_num_tex_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout, "\tTotal SFU Acesses=%f\n", m_num_sfu_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal SP Acesses=%u\n", + fprintf(fout, "\tTotal SP Acesses=%f\n", m_num_sp_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal MEM Acesses=%u\n", + fprintf(fout, "\tTotal MEM Acesses=%f\n", m_num_mem_acesses[CURRENT_STAT_IDX][i]); fprintf(fout, "\tTotal SFU Commissions=%u\n", m_num_sfu_committed[CURRENT_STAT_IDX][i]); @@ -214,9 +290,19 @@ void power_core_stat_t::init() { m_num_fpmul_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_fpmul_acesses; m_num_idiv_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_idiv_acesses; m_num_fpdiv_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_fpdiv_acesses; + m_num_dp_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_dp_acesses; + m_num_dpmul_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_dpmul_acesses; + m_num_dpdiv_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_dpdiv_acesses; m_num_sp_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_sp_acesses; m_num_sfu_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_sfu_acesses; - m_num_trans_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_trans_acesses; + m_num_sqrt_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_sqrt_acesses; + m_num_log_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_log_acesses; + m_num_sin_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_sin_acesses; + m_num_exp_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_exp_acesses; + m_num_tensor_core_acesses[CURRENT_STAT_IDX] = + m_core_stats->m_num_tensor_core_acesses; + m_num_const_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_const_acesses; + m_num_tex_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_tex_acesses; m_num_mem_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_mem_acesses; m_num_sp_committed[CURRENT_STAT_IDX] = m_core_stats->m_num_sp_committed; m_num_sfu_committed[CURRENT_STAT_IDX] = m_core_stats->m_num_sfu_committed; @@ -228,6 +314,8 @@ void power_core_stat_t::init() { m_non_rf_operands[CURRENT_STAT_IDX] = m_core_stats->m_non_rf_operands; m_active_sp_lanes[CURRENT_STAT_IDX] = m_core_stats->m_active_sp_lanes; m_active_sfu_lanes[CURRENT_STAT_IDX] = m_core_stats->m_active_sfu_lanes; + m_active_exu_threads[CURRENT_STAT_IDX] = m_core_stats->m_active_exu_threads; + m_active_exu_warps[CURRENT_STAT_IDX] = m_core_stats->m_active_exu_warps; m_num_tex_inst[CURRENT_STAT_IDX] = m_core_stats->m_num_tex_inst; m_pipeline_duty_cycle[PREV_STAT_IDX] = @@ -242,32 +330,51 @@ void power_core_stat_t::init() { (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); m_num_loadqueued_insn[PREV_STAT_IDX] = (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_num_ialu_acesses[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_num_fp_acesses[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); m_num_tex_inst[PREV_STAT_IDX] = (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); + + m_num_ialu_acesses[PREV_STAT_IDX] = + (double *)calloc(m_config->num_shader(), sizeof(double)); + m_num_fp_acesses[PREV_STAT_IDX] = + (double *)calloc(m_config->num_shader(), sizeof(double)); m_num_imul_acesses[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); + (double *)calloc(m_config->num_shader(), sizeof(double)); m_num_imul24_acesses[PREV_STAT_IDX] = - (unsigned 
*)calloc(m_config->num_shader(), sizeof(unsigned)); + (double *)calloc(m_config->num_shader(), sizeof(double)); m_num_imul32_acesses[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); + (double *)calloc(m_config->num_shader(), sizeof(double)); m_num_fpmul_acesses[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); + (double *)calloc(m_config->num_shader(), sizeof(double)); m_num_idiv_acesses[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); + (double *)calloc(m_config->num_shader(), sizeof(double)); m_num_fpdiv_acesses[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); + (double *)calloc(m_config->num_shader(), sizeof(double)); + m_num_dp_acesses[PREV_STAT_IDX] = + (double *)calloc(m_config->num_shader(), sizeof(double)); + m_num_dpmul_acesses[PREV_STAT_IDX] = + (double *)calloc(m_config->num_shader(), sizeof(double)); + m_num_dpdiv_acesses[PREV_STAT_IDX] = + (double *)calloc(m_config->num_shader(), sizeof(double)); + m_num_tensor_core_acesses[PREV_STAT_IDX] = + (double *)calloc(m_config->num_shader(), sizeof(double)); + m_num_const_acesses[PREV_STAT_IDX] = + (double *)calloc(m_config->num_shader(), sizeof(double)); + m_num_tex_acesses[PREV_STAT_IDX] = + (double *)calloc(m_config->num_shader(), sizeof(double)); m_num_sp_acesses[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); + (double *)calloc(m_config->num_shader(), sizeof(double)); m_num_sfu_acesses[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_num_trans_acesses[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); + (double *)calloc(m_config->num_shader(), sizeof(double)); + m_num_sqrt_acesses[PREV_STAT_IDX] = + (double *)calloc(m_config->num_shader(), sizeof(double)); + m_num_log_acesses[PREV_STAT_IDX] = + (double *)calloc(m_config->num_shader(), sizeof(double)); + m_num_sin_acesses[PREV_STAT_IDX] = + (double *)calloc(m_config->num_shader(), sizeof(double)); + m_num_exp_acesses[PREV_STAT_IDX] = + (double *)calloc(m_config->num_shader(), sizeof(double)); m_num_mem_acesses[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); + (double *)calloc(m_config->num_shader(), sizeof(double)); m_num_sp_committed[PREV_STAT_IDX] = (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); m_num_sfu_committed[PREV_STAT_IDX] = @@ -284,6 +391,10 @@ void power_core_stat_t::init() { (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); m_active_sfu_lanes[PREV_STAT_IDX] = (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); + m_active_exu_threads[PREV_STAT_IDX] = + (double *)calloc(m_config->num_shader(), sizeof(double)); + m_active_exu_warps[PREV_STAT_IDX] = + (double *)calloc(m_config->num_shader(), sizeof(double)); } void power_core_stat_t::save_stats() { @@ -319,8 +430,25 @@ void power_core_stat_t::save_stats() { m_num_sp_acesses[PREV_STAT_IDX][i] = m_num_sp_acesses[CURRENT_STAT_IDX][i]; m_num_sfu_acesses[PREV_STAT_IDX][i] = m_num_sfu_acesses[CURRENT_STAT_IDX][i]; - m_num_trans_acesses[PREV_STAT_IDX][i] = - m_num_trans_acesses[CURRENT_STAT_IDX][i]; + m_num_sqrt_acesses[PREV_STAT_IDX][i] = + m_num_sqrt_acesses[CURRENT_STAT_IDX][i]; + m_num_log_acesses[PREV_STAT_IDX][i] = + m_num_log_acesses[CURRENT_STAT_IDX][i]; + m_num_sin_acesses[PREV_STAT_IDX][i] = + m_num_sin_acesses[CURRENT_STAT_IDX][i]; + m_num_exp_acesses[PREV_STAT_IDX][i] = + m_num_exp_acesses[CURRENT_STAT_IDX][i]; + 
m_num_dp_acesses[PREV_STAT_IDX][i] = m_num_dp_acesses[CURRENT_STAT_IDX][i]; + m_num_dpmul_acesses[PREV_STAT_IDX][i] = + m_num_dpmul_acesses[CURRENT_STAT_IDX][i]; + m_num_dpdiv_acesses[PREV_STAT_IDX][i] = + m_num_dpdiv_acesses[CURRENT_STAT_IDX][i]; + m_num_tensor_core_acesses[PREV_STAT_IDX][i] = + m_num_tensor_core_acesses[CURRENT_STAT_IDX][i]; + m_num_const_acesses[PREV_STAT_IDX][i] = + m_num_const_acesses[CURRENT_STAT_IDX][i]; + m_num_tex_acesses[PREV_STAT_IDX][i] = + m_num_tex_acesses[CURRENT_STAT_IDX][i]; m_num_mem_acesses[PREV_STAT_IDX][i] = m_num_mem_acesses[CURRENT_STAT_IDX][i]; m_num_sp_committed[PREV_STAT_IDX][i] = @@ -339,6 +467,10 @@ void power_core_stat_t::save_stats() { m_active_sp_lanes[CURRENT_STAT_IDX][i]; m_active_sfu_lanes[PREV_STAT_IDX][i] = m_active_sfu_lanes[CURRENT_STAT_IDX][i]; + m_active_exu_threads[PREV_STAT_IDX][i] = + m_active_exu_threads[CURRENT_STAT_IDX][i]; + m_active_exu_warps[PREV_STAT_IDX][i] = + m_active_exu_warps[CURRENT_STAT_IDX][i]; } } @@ -356,6 +488,51 @@ power_stat_t::power_stat_t(const shader_core_config *shader_config, m_active_sms = active_sms; m_config = shader_config; m_mem_config = mem_config; + l1r_hits_kernel = 0; + l1r_misses_kernel = 0; + l1w_hits_kernel = 0; + l1w_misses_kernel = 0; + shared_accesses_kernel = 0; + cc_accesses_kernel = 0; + dram_rd_kernel = 0; + dram_wr_kernel = 0; + dram_pre_kernel = 0; + l1i_hits_kernel = 0; + l1i_misses_kernel = 0; + l2r_hits_kernel = 0; + l2r_misses_kernel = 0; + l2w_hits_kernel = 0; + l2w_misses_kernel = 0; + noc_tr_kernel = 0; + noc_rc_kernel = 0; + + tot_inst_execution = 0; + tot_int_inst_execution = 0; + tot_fp_inst_execution = 0; + commited_inst_execution = 0; + ialu_acc_execution = 0; + imul24_acc_execution = 0; + imul32_acc_execution = 0; + imul_acc_execution = 0; + idiv_acc_execution = 0; + dp_acc_execution = 0; + dpmul_acc_execution = 0; + dpdiv_acc_execution = 0; + fp_acc_execution = 0; + fpmul_acc_execution = 0; + fpdiv_acc_execution = 0; + sqrt_acc_execution = 0; + log_acc_execution = 0; + sin_acc_execution = 0; + exp_acc_execution = 0; + tensor_acc_execution = 0; + tex_acc_execution = 0; + tot_fpu_acc_execution = 0; + tot_sfu_acc_execution = 0; + tot_threads_acc_execution = 0; + tot_warps_acc_execution = 0; + sp_active_lanes_execution = 0; + sfu_active_lanes_execution = 0; } void power_stat_t::visualizer_print(gzFile visualizer_file) { diff --git a/src/gpgpu-sim/power_stat.h b/src/gpgpu-sim/power_stat.h index c469db3b3..13f144ab4 100644 --- a/src/gpgpu-sim/power_stat.h +++ b/src/gpgpu-sim/power_stat.h @@ -1,18 +1,21 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Ahmed El-Shafiey, Tayler Hetherington -// The University of British Columbia -// All rights reserved. +// Copyright (c) 2009-2021, Tor M. Aamodt, Ahmed El-Shafiey, Tayler +// Hetherington, Vijay Kandiah, Nikos Hardavellas, Mahmoud Khairy, Junrui Pan, +// Timothy G. Rogers The University of British Columbia, Northwestern +// University, Purdue University All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. 
Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, +// this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -53,24 +56,35 @@ struct shader_core_power_stats_pod { // by this shader core unsigned *m_num_storequeued_insn[NUM_STAT_IDX]; unsigned *m_num_loadqueued_insn[NUM_STAT_IDX]; - unsigned *m_num_ialu_acesses[NUM_STAT_IDX]; - unsigned *m_num_fp_acesses[NUM_STAT_IDX]; unsigned *m_num_tex_inst[NUM_STAT_IDX]; - unsigned *m_num_imul_acesses[NUM_STAT_IDX]; - unsigned *m_num_imul32_acesses[NUM_STAT_IDX]; - unsigned *m_num_imul24_acesses[NUM_STAT_IDX]; - unsigned *m_num_fpmul_acesses[NUM_STAT_IDX]; - unsigned *m_num_idiv_acesses[NUM_STAT_IDX]; - unsigned *m_num_fpdiv_acesses[NUM_STAT_IDX]; - unsigned *m_num_sp_acesses[NUM_STAT_IDX]; - unsigned *m_num_sfu_acesses[NUM_STAT_IDX]; - unsigned *m_num_trans_acesses[NUM_STAT_IDX]; - unsigned *m_num_mem_acesses[NUM_STAT_IDX]; + double *m_num_ialu_acesses[NUM_STAT_IDX]; + double *m_num_fp_acesses[NUM_STAT_IDX]; + double *m_num_imul_acesses[NUM_STAT_IDX]; + double *m_num_imul32_acesses[NUM_STAT_IDX]; + double *m_num_imul24_acesses[NUM_STAT_IDX]; + double *m_num_fpmul_acesses[NUM_STAT_IDX]; + double *m_num_idiv_acesses[NUM_STAT_IDX]; + double *m_num_fpdiv_acesses[NUM_STAT_IDX]; + double *m_num_dp_acesses[NUM_STAT_IDX]; + double *m_num_dpmul_acesses[NUM_STAT_IDX]; + double *m_num_dpdiv_acesses[NUM_STAT_IDX]; + double *m_num_sp_acesses[NUM_STAT_IDX]; + double *m_num_sfu_acesses[NUM_STAT_IDX]; + double *m_num_sqrt_acesses[NUM_STAT_IDX]; + double *m_num_log_acesses[NUM_STAT_IDX]; + double *m_num_sin_acesses[NUM_STAT_IDX]; + double *m_num_exp_acesses[NUM_STAT_IDX]; + double *m_num_tensor_core_acesses[NUM_STAT_IDX]; + double *m_num_const_acesses[NUM_STAT_IDX]; + double *m_num_tex_acesses[NUM_STAT_IDX]; + double *m_num_mem_acesses[NUM_STAT_IDX]; unsigned *m_num_sp_committed[NUM_STAT_IDX]; unsigned *m_num_sfu_committed[NUM_STAT_IDX]; unsigned *m_num_mem_committed[NUM_STAT_IDX]; unsigned *m_active_sp_lanes[NUM_STAT_IDX]; unsigned *m_active_sfu_lanes[NUM_STAT_IDX]; + double *m_active_exu_threads[NUM_STAT_IDX]; + double *m_active_exu_warps[NUM_STAT_IDX]; unsigned *m_read_regfile_acesses[NUM_STAT_IDX]; unsigned *m_write_regfile_acesses[NUM_STAT_IDX]; unsigned *m_non_rf_operands[NUM_STAT_IDX]; @@ -96,8 +110,7 @@ struct mem_power_stats_pod { class cache_stats core_cache_stats[NUM_STAT_IDX]; // Total core stats class cache_stats l2_cache_stats[NUM_STAT_IDX]; // Total L2 partition stats - unsigned *shmem_read_access[NUM_STAT_IDX]; // Shared memory access - + unsigned *shmem_access[NUM_STAT_IDX]; // Shared memory access // Low level DRAM stats unsigned *n_cmd[NUM_STAT_IDX]; unsigned *n_activity[NUM_STAT_IDX]; @@ 
-106,6 +119,7 @@ struct mem_power_stats_pod { unsigned *n_pre[NUM_STAT_IDX]; unsigned *n_rd[NUM_STAT_IDX]; unsigned *n_wr[NUM_STAT_IDX]; + unsigned *n_wr_WB[NUM_STAT_IDX]; unsigned *n_req[NUM_STAT_IDX]; // Interconnect stats @@ -144,34 +158,90 @@ class power_stat_t { *m_average_pipeline_duty_cycle = 0; *m_active_sms = 0; } - - unsigned get_total_inst() { - unsigned total_inst = 0; + void clear(); + unsigned l1i_misses_kernel; + unsigned l1i_hits_kernel; + unsigned long long l1r_hits_kernel; + unsigned long long l1r_misses_kernel; + unsigned long long l1w_hits_kernel; + unsigned long long l1w_misses_kernel; + unsigned long long shared_accesses_kernel; + unsigned long long cc_accesses_kernel; + unsigned long long dram_rd_kernel; + unsigned long long dram_wr_kernel; + unsigned long long dram_pre_kernel; + unsigned long long l2r_hits_kernel; + unsigned long long l2r_misses_kernel; + unsigned long long l2w_hits_kernel; + unsigned long long l2w_misses_kernel; + unsigned long long noc_tr_kernel; + unsigned long long noc_rc_kernel; + unsigned long long tot_inst_execution; + unsigned long long tot_int_inst_execution; + unsigned long long tot_fp_inst_execution; + unsigned long long commited_inst_execution; + unsigned long long ialu_acc_execution; + unsigned long long imul24_acc_execution; + unsigned long long imul32_acc_execution; + unsigned long long imul_acc_execution; + unsigned long long idiv_acc_execution; + unsigned long long dp_acc_execution; + unsigned long long dpmul_acc_execution; + unsigned long long dpdiv_acc_execution; + unsigned long long fp_acc_execution; + unsigned long long fpmul_acc_execution; + unsigned long long fpdiv_acc_execution; + unsigned long long sqrt_acc_execution; + unsigned long long log_acc_execution; + unsigned long long sin_acc_execution; + unsigned long long exp_acc_execution; + unsigned long long tensor_acc_execution; + unsigned long long tex_acc_execution; + unsigned long long tot_fpu_acc_execution; + unsigned long long tot_sfu_acc_execution; + unsigned long long tot_threads_acc_execution; + unsigned long long tot_warps_acc_execution; + unsigned long long sp_active_lanes_execution; + unsigned long long sfu_active_lanes_execution; + double get_total_inst(bool aggregate_stat) { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += (pwr_core_stat->m_num_decoded_insn[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_decoded_insn[PREV_STAT_IDX][i]); + if (aggregate_stat) + total_inst += (pwr_core_stat->m_num_decoded_insn[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_decoded_insn[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_decoded_insn[PREV_STAT_IDX][i]); } return total_inst; } - unsigned get_total_int_inst() { - unsigned total_inst = 0; + double get_total_int_inst(bool aggregate_stat) { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += - (pwr_core_stat->m_num_INTdecoded_insn[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_INTdecoded_insn[PREV_STAT_IDX][i]); + if (aggregate_stat) + total_inst += + (pwr_core_stat->m_num_INTdecoded_insn[CURRENT_STAT_IDX][i]); + else + total_inst += + (pwr_core_stat->m_num_INTdecoded_insn[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_INTdecoded_insn[PREV_STAT_IDX][i]); } return total_inst; } - unsigned get_total_fp_inst() { - unsigned total_inst = 0; + double get_total_fp_inst(bool aggregate_stat) { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += 
(pwr_core_stat->m_num_FPdecoded_insn[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_FPdecoded_insn[PREV_STAT_IDX][i]); + if (aggregate_stat) + total_inst += + (pwr_core_stat->m_num_FPdecoded_insn[CURRENT_STAT_IDX][i]); + else + total_inst += + (pwr_core_stat->m_num_FPdecoded_insn[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_FPdecoded_insn[PREV_STAT_IDX][i]); } return total_inst; } - unsigned get_total_load_inst() { - unsigned total_inst = 0; + double get_total_load_inst() { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { total_inst += (pwr_core_stat->m_num_loadqueued_insn[CURRENT_STAT_IDX][i]) - @@ -179,8 +249,8 @@ class power_stat_t { } return total_inst; } - unsigned get_total_store_inst() { - unsigned total_inst = 0; + double get_total_store_inst() { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { total_inst += (pwr_core_stat->m_num_storequeued_insn[CURRENT_STAT_IDX][i]) - @@ -188,57 +258,72 @@ class power_stat_t { } return total_inst; } - unsigned get_sp_committed_inst() { - unsigned total_inst = 0; + double get_sp_committed_inst() { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { total_inst += (pwr_core_stat->m_num_sp_committed[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_sp_committed[PREV_STAT_IDX][i]); } return total_inst; } - unsigned get_sfu_committed_inst() { - unsigned total_inst = 0; + double get_sfu_committed_inst() { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { total_inst += (pwr_core_stat->m_num_sfu_committed[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_sfu_committed[PREV_STAT_IDX][i]); } return total_inst; } - unsigned get_mem_committed_inst() { - unsigned total_inst = 0; + double get_mem_committed_inst() { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { total_inst += (pwr_core_stat->m_num_mem_committed[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_mem_committed[PREV_STAT_IDX][i]); } return total_inst; } - unsigned get_committed_inst() { - unsigned total_inst = 0; + double get_committed_inst(bool aggregate_stat) { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += (pwr_core_stat->m_num_mem_committed[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_mem_committed[PREV_STAT_IDX][i]) + - (pwr_core_stat->m_num_sfu_committed[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_sfu_committed[PREV_STAT_IDX][i]) + - (pwr_core_stat->m_num_sp_committed[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_sp_committed[PREV_STAT_IDX][i]); + if (aggregate_stat) + total_inst += + (pwr_core_stat->m_num_mem_committed[CURRENT_STAT_IDX][i]) + + (pwr_core_stat->m_num_sfu_committed[CURRENT_STAT_IDX][i]) + + (pwr_core_stat->m_num_sp_committed[CURRENT_STAT_IDX][i]); + else + total_inst += + (pwr_core_stat->m_num_mem_committed[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_mem_committed[PREV_STAT_IDX][i]) + + (pwr_core_stat->m_num_sfu_committed[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_sfu_committed[PREV_STAT_IDX][i]) + + (pwr_core_stat->m_num_sp_committed[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_sp_committed[PREV_STAT_IDX][i]); } return total_inst; } - unsigned get_regfile_reads() { - unsigned total_inst = 0; + double get_regfile_reads(bool aggregate_stat) { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += - (pwr_core_stat->m_read_regfile_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_read_regfile_acesses[PREV_STAT_IDX][i]); 
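Every getter that gains a bool aggregate_stat parameter in this header follows the same two-snapshot idiom: each counter keeps a PREV_STAT_IDX and a CURRENT_STAT_IDX slot per shader core, and a query returns either the running total (CURRENT only, when aggregate_stat is set) or the delta since the last power sample (CURRENT minus PREV). A minimal sketch of that idiom follows; the windowed_counter struct and sum() helper are invented for illustration, and the enum ordering is arbitrary for the sketch rather than copied from the patch:

// Illustrative two-snapshot counter mirroring the PREV/CURRENT windowing
// used by the power_stat_t getters in this header (names of the struct and
// method are hypothetical).
enum { PREV_STAT_IDX = 0, CURRENT_STAT_IDX = 1, NUM_STAT_IDX = 2 };

struct windowed_counter {
  double *slot[NUM_STAT_IDX];  // one value per shader core

  double sum(unsigned num_shader, bool aggregate_stat) const {
    double total = 0;
    for (unsigned i = 0; i < num_shader; i++) {
      if (aggregate_stat)
        total += slot[CURRENT_STAT_IDX][i];  // running total
      else
        total += slot[CURRENT_STAT_IDX][i] -
                 slot[PREV_STAT_IDX][i];  // delta since the last sample
    }
    return total;
  }
};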
+ if (aggregate_stat) + total_inst += + (pwr_core_stat->m_read_regfile_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += + (pwr_core_stat->m_read_regfile_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_read_regfile_acesses[PREV_STAT_IDX][i]); } return total_inst; } - unsigned get_regfile_writes() { - unsigned total_inst = 0; + double get_regfile_writes(bool aggregate_stat) { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += - (pwr_core_stat->m_write_regfile_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_write_regfile_acesses[PREV_STAT_IDX][i]); + if (aggregate_stat) + total_inst += + (pwr_core_stat->m_write_regfile_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += + (pwr_core_stat->m_write_regfile_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_write_regfile_acesses[PREV_STAT_IDX][i]); } return total_inst; } @@ -253,17 +338,20 @@ class power_stat_t { return total_inst; } - unsigned get_non_regfile_operands() { - unsigned total_inst = 0; + double get_non_regfile_operands(bool aggregate_stat) { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += (pwr_core_stat->m_non_rf_operands[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_non_rf_operands[PREV_STAT_IDX][i]); + if (aggregate_stat) + total_inst += (pwr_core_stat->m_non_rf_operands[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_non_rf_operands[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_non_rf_operands[PREV_STAT_IDX][i]); } return total_inst; } - unsigned get_sp_accessess() { - unsigned total_inst = 0; + double get_sp_accessess() { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { total_inst += (pwr_core_stat->m_num_sp_acesses[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_sp_acesses[PREV_STAT_IDX][i]); @@ -271,25 +359,62 @@ class power_stat_t { return total_inst; } - unsigned get_sfu_accessess() { - unsigned total_inst = 0; + double get_sfu_accessess() { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { total_inst += (pwr_core_stat->m_num_sfu_acesses[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_sfu_acesses[PREV_STAT_IDX][i]); } return total_inst; } - unsigned get_trans_accessess() { - unsigned total_inst = 0; + + double get_sqrt_accessess(bool aggregate_stat) { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += (pwr_core_stat->m_num_trans_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_trans_acesses[PREV_STAT_IDX][i]); + if (aggregate_stat) + total_inst += (pwr_core_stat->m_num_sqrt_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_sqrt_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_sqrt_acesses[PREV_STAT_IDX][i]); + } + return total_inst; + } + double get_log_accessess(bool aggregate_stat) { + double total_inst = 0; + for (unsigned i = 0; i < m_config->num_shader(); i++) { + if (aggregate_stat) + total_inst += (pwr_core_stat->m_num_log_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_log_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_log_acesses[PREV_STAT_IDX][i]); + } + return total_inst; + } + double get_sin_accessess(bool aggregate_stat) { + double total_inst = 0; + for (unsigned i = 0; i < m_config->num_shader(); i++) { + if (aggregate_stat) + total_inst += (pwr_core_stat->m_num_sin_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_sin_acesses[CURRENT_STAT_IDX][i]) - + 
(pwr_core_stat->m_num_sin_acesses[PREV_STAT_IDX][i]); + } + return total_inst; + } + double get_exp_accessess(bool aggregate_stat) { + double total_inst = 0; + for (unsigned i = 0; i < m_config->num_shader(); i++) { + if (aggregate_stat) + total_inst += (pwr_core_stat->m_num_exp_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_exp_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_exp_acesses[PREV_STAT_IDX][i]); } return total_inst; } - unsigned get_mem_accessess() { - unsigned total_inst = 0; + double get_mem_accessess() { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { total_inst += (pwr_core_stat->m_num_mem_acesses[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_mem_acesses[PREV_STAT_IDX][i]); @@ -297,66 +422,175 @@ class power_stat_t { return total_inst; } - unsigned get_intdiv_accessess() { - unsigned total_inst = 0; + double get_intdiv_accessess(bool aggregate_stat) { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += (pwr_core_stat->m_num_idiv_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_idiv_acesses[PREV_STAT_IDX][i]); + if (aggregate_stat) + total_inst += (pwr_core_stat->m_num_idiv_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_idiv_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_idiv_acesses[PREV_STAT_IDX][i]); } return total_inst; } - unsigned get_fpdiv_accessess() { - unsigned total_inst = 0; + double get_fpdiv_accessess(bool aggregate_stat) { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += (pwr_core_stat->m_num_fpdiv_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_fpdiv_acesses[PREV_STAT_IDX][i]); + if (aggregate_stat) + total_inst += (pwr_core_stat->m_num_fpdiv_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += + (pwr_core_stat->m_num_fpdiv_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_fpdiv_acesses[PREV_STAT_IDX][i]); } return total_inst; } - unsigned get_intmul32_accessess() { - unsigned total_inst = 0; + double get_intmul32_accessess(bool aggregate_stat) { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += (pwr_core_stat->m_num_imul32_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_imul32_acesses[PREV_STAT_IDX][i]); + if (aggregate_stat) + total_inst += + (pwr_core_stat->m_num_imul32_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += + (pwr_core_stat->m_num_imul32_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_imul32_acesses[PREV_STAT_IDX][i]); } return total_inst; } - unsigned get_intmul24_accessess() { - unsigned total_inst = 0; + double get_intmul24_accessess(bool aggregate_stat) { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += (pwr_core_stat->m_num_imul24_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_imul24_acesses[PREV_STAT_IDX][i]); + if (aggregate_stat) + total_inst += + (pwr_core_stat->m_num_imul24_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += + (pwr_core_stat->m_num_imul24_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_imul24_acesses[PREV_STAT_IDX][i]); } return total_inst; } - unsigned get_intmul_accessess() { - unsigned total_inst = 0; + double get_intmul_accessess(bool aggregate_stat) { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += (pwr_core_stat->m_num_imul_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_imul_acesses[PREV_STAT_IDX][i]) + - 
(pwr_core_stat->m_num_imul24_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_imul24_acesses[PREV_STAT_IDX][i]) + - (pwr_core_stat->m_num_imul32_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_imul32_acesses[PREV_STAT_IDX][i]); + if (aggregate_stat) + total_inst += (pwr_core_stat->m_num_imul_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_imul_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_imul_acesses[PREV_STAT_IDX][i]); } return total_inst; } - unsigned get_fpmul_accessess() { - unsigned total_inst = 0; + double get_fpmul_accessess(bool aggregate_stat) { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += (pwr_core_stat->m_num_fp_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_fp_acesses[PREV_STAT_IDX][i]); + if (aggregate_stat) + total_inst += (pwr_core_stat->m_num_fpmul_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += + (pwr_core_stat->m_num_fpmul_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_fpmul_acesses[PREV_STAT_IDX][i]); } return total_inst; } - float get_sp_active_lanes() { - unsigned total_inst = 0; + double get_fp_accessess(bool aggregate_stat) { + double total_inst = 0; + for (unsigned i = 0; i < m_config->num_shader(); i++) { + if (aggregate_stat) + total_inst += (pwr_core_stat->m_num_fp_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_fp_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_fp_acesses[PREV_STAT_IDX][i]); + } + return total_inst; + } + + double get_dp_accessess(bool aggregate_stat) { + double total_inst = 0; + for (unsigned i = 0; i < m_config->num_shader(); i++) { + if (aggregate_stat) + total_inst += (pwr_core_stat->m_num_dp_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_dp_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_dp_acesses[PREV_STAT_IDX][i]); + } + return total_inst; + } + + double get_dpmul_accessess(bool aggregate_stat) { + double total_inst = 0; + for (unsigned i = 0; i < m_config->num_shader(); i++) { + if (aggregate_stat) + total_inst += (pwr_core_stat->m_num_dpmul_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += + (pwr_core_stat->m_num_dpmul_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_dpmul_acesses[PREV_STAT_IDX][i]); + } + return total_inst; + } + + double get_dpdiv_accessess(bool aggregate_stat) { + double total_inst = 0; + for (unsigned i = 0; i < m_config->num_shader(); i++) { + if (aggregate_stat) + total_inst += (pwr_core_stat->m_num_dpdiv_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += + (pwr_core_stat->m_num_dpdiv_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_dpdiv_acesses[PREV_STAT_IDX][i]); + } + return total_inst; + } + + double get_tensor_accessess(bool aggregate_stat) { + double total_inst = 0; + for (unsigned i = 0; i < m_config->num_shader(); i++) { + if (aggregate_stat) + total_inst += + (pwr_core_stat->m_num_tensor_core_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += + (pwr_core_stat->m_num_tensor_core_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_tensor_core_acesses[PREV_STAT_IDX][i]); + } + return total_inst; + } + + double get_const_accessess(bool aggregate_stat) { + double total_inst = 0; + for (unsigned i = 0; i < m_config->num_shader(); i++) { + if (aggregate_stat) + total_inst += pwr_core_stat->m_num_const_acesses[CURRENT_STAT_IDX][i]; + else + total_inst += + (pwr_core_stat->m_num_const_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_const_acesses[PREV_STAT_IDX][i]); + } + return 
(total_inst); + } + + double get_tex_accessess(bool aggregate_stat) { + double total_inst = 0; + for (unsigned i = 0; i < m_config->num_shader(); i++) { + if (aggregate_stat) + total_inst += (pwr_core_stat->m_num_tex_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_tex_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_tex_acesses[PREV_STAT_IDX][i]); + } + return total_inst; + } + + double get_sp_active_lanes() { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { total_inst += (pwr_core_stat->m_active_sp_lanes[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_active_sp_lanes[PREV_STAT_IDX][i]); @@ -365,7 +599,7 @@ class power_stat_t { } float get_sfu_active_lanes() { - unsigned total_inst = 0; + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { total_inst += (pwr_core_stat->m_active_sfu_lanes[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_active_sfu_lanes[PREV_STAT_IDX][i]); @@ -375,49 +609,142 @@ class power_stat_t { m_config->gpgpu_num_sfu_units; } - unsigned get_tot_fpu_accessess() { - unsigned total_inst = 0; + float get_active_threads(bool aggregate_stat) { + unsigned total_threads = 0; + unsigned total_warps = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += (pwr_core_stat->m_num_fp_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_fp_acesses[PREV_STAT_IDX][i]) + - (pwr_core_stat->m_num_fpdiv_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_fpdiv_acesses[PREV_STAT_IDX][i]) + - (pwr_core_stat->m_num_fpmul_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_fpmul_acesses[PREV_STAT_IDX][i]) + - (pwr_core_stat->m_num_imul24_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_imul24_acesses[PREV_STAT_IDX][i]) + - (pwr_core_stat->m_num_imul_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_imul_acesses[PREV_STAT_IDX][i]); + if (aggregate_stat) { + total_threads += + (pwr_core_stat->m_active_exu_threads[CURRENT_STAT_IDX][i]); + total_warps += (pwr_core_stat->m_active_exu_warps[CURRENT_STAT_IDX][i]); + } else { + total_threads += + (pwr_core_stat->m_active_exu_threads[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_active_exu_threads[PREV_STAT_IDX][i]); + total_warps += + (pwr_core_stat->m_active_exu_warps[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_active_exu_warps[PREV_STAT_IDX][i]); + } } - total_inst += - get_total_load_inst() + get_total_store_inst() + get_tex_inst(); + if (total_warps != 0) + return (float)((float)total_threads / (float)total_warps); + else + return 0; + } + + unsigned long long get_tot_threads_kernel(bool aggregate_stat) { + unsigned total_threads = 0; + for (unsigned i = 0; i < m_config->num_shader(); i++) { + if (aggregate_stat) { + total_threads += + (pwr_core_stat->m_active_exu_threads[CURRENT_STAT_IDX][i]); + } else { + total_threads += + (pwr_core_stat->m_active_exu_threads[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_active_exu_threads[PREV_STAT_IDX][i]); + } + } + + return total_threads; + } + unsigned long long get_tot_warps_kernel(bool aggregate_stat) { + unsigned long long total_warps = 0; + for (unsigned i = 0; i < m_config->num_shader(); i++) { + if (aggregate_stat) { + total_warps += (pwr_core_stat->m_active_exu_warps[CURRENT_STAT_IDX][i]); + } else { + total_warps += + (pwr_core_stat->m_active_exu_warps[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_active_exu_warps[PREV_STAT_IDX][i]); + } + } + return total_warps; + } + + double get_tot_fpu_accessess(bool aggregate_stat) { + double total_inst = 0; + for (unsigned i = 0; i < 
m_config->num_shader(); i++) { + if (aggregate_stat) + total_inst += (pwr_core_stat->m_num_fp_acesses[CURRENT_STAT_IDX][i]) + + (pwr_core_stat->m_num_dp_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_fp_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_fp_acesses[PREV_STAT_IDX][i]) + + (pwr_core_stat->m_num_dp_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_dp_acesses[PREV_STAT_IDX][i]); + } + // total_inst += + // get_total_load_inst()+get_total_store_inst()+get_tex_inst(); return total_inst; } - unsigned get_tot_sfu_accessess() { - unsigned total_inst = 0; + double get_tot_sfu_accessess(bool aggregate_stat) { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += (pwr_core_stat->m_num_idiv_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_idiv_acesses[PREV_STAT_IDX][i]) + - (pwr_core_stat->m_num_imul32_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_imul32_acesses[PREV_STAT_IDX][i]) + - (pwr_core_stat->m_num_trans_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_trans_acesses[PREV_STAT_IDX][i]); + if (aggregate_stat) + total_inst += + (pwr_core_stat->m_num_idiv_acesses[CURRENT_STAT_IDX][i]) + + (pwr_core_stat->m_num_imul32_acesses[CURRENT_STAT_IDX][i]) + + (pwr_core_stat->m_num_sqrt_acesses[CURRENT_STAT_IDX][i]) + + (pwr_core_stat->m_num_log_acesses[CURRENT_STAT_IDX][i]) + + (pwr_core_stat->m_num_sin_acesses[CURRENT_STAT_IDX][i]) + + (pwr_core_stat->m_num_exp_acesses[CURRENT_STAT_IDX][i]) + + (pwr_core_stat->m_num_fpdiv_acesses[CURRENT_STAT_IDX][i]) + + (pwr_core_stat->m_num_fpmul_acesses[CURRENT_STAT_IDX][i]) + + (pwr_core_stat->m_num_dpmul_acesses[CURRENT_STAT_IDX][i]) + + (pwr_core_stat->m_num_dpdiv_acesses[CURRENT_STAT_IDX][i]) + + (pwr_core_stat->m_num_imul24_acesses[CURRENT_STAT_IDX][i]) + + (pwr_core_stat->m_num_imul_acesses[CURRENT_STAT_IDX][i]) + + (pwr_core_stat->m_num_tensor_core_acesses[CURRENT_STAT_IDX][i]) + + (pwr_core_stat->m_num_tex_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += + (pwr_core_stat->m_num_idiv_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_idiv_acesses[PREV_STAT_IDX][i]) + + (pwr_core_stat->m_num_imul32_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_imul32_acesses[PREV_STAT_IDX][i]) + + (pwr_core_stat->m_num_sqrt_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_sqrt_acesses[PREV_STAT_IDX][i]) + + (pwr_core_stat->m_num_log_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_log_acesses[PREV_STAT_IDX][i]) + + (pwr_core_stat->m_num_sin_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_sin_acesses[PREV_STAT_IDX][i]) + + (pwr_core_stat->m_num_exp_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_exp_acesses[PREV_STAT_IDX][i]) + + (pwr_core_stat->m_num_fpdiv_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_fpdiv_acesses[PREV_STAT_IDX][i]) + + (pwr_core_stat->m_num_fpmul_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_fpmul_acesses[PREV_STAT_IDX][i]) + + (pwr_core_stat->m_num_dpmul_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_dpmul_acesses[PREV_STAT_IDX][i]) + + (pwr_core_stat->m_num_dpdiv_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_dpdiv_acesses[PREV_STAT_IDX][i]) + + (pwr_core_stat->m_num_imul24_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_imul24_acesses[PREV_STAT_IDX][i]) + + (pwr_core_stat->m_num_imul_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_imul_acesses[PREV_STAT_IDX][i]) + + (pwr_core_stat->m_num_tensor_core_acesses[CURRENT_STAT_IDX][i]) - + 
(pwr_core_stat->m_num_tensor_core_acesses[PREV_STAT_IDX][i]) + + (pwr_core_stat->m_num_tex_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_tex_acesses[PREV_STAT_IDX][i]); } return total_inst; } - unsigned get_ialu_accessess() { - unsigned total_inst = 0; + double get_ialu_accessess(bool aggregate_stat) { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += (pwr_core_stat->m_num_ialu_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_ialu_acesses[PREV_STAT_IDX][i]); + if (aggregate_stat) + total_inst += (pwr_core_stat->m_num_ialu_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_ialu_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_ialu_acesses[PREV_STAT_IDX][i]); } return total_inst; } - unsigned get_tex_inst() { - unsigned total_inst = 0; + double get_tex_inst() { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { total_inst += (pwr_core_stat->m_num_tex_inst[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_tex_inst[PREV_STAT_IDX][i]); @@ -425,7 +752,7 @@ class power_stat_t { return total_inst; } - unsigned get_constant_c_accesses() { + double get_constant_c_accesses() { enum mem_access_type access_type[] = {CONST_ACC_R}; enum cache_request_status request_status[] = {HIT, MISS, HIT_RESERVED}; unsigned num_access_type = @@ -440,7 +767,7 @@ class power_stat_t { access_type, num_access_type, request_status, num_request_status)); } - unsigned get_constant_c_misses() { + double get_constant_c_misses() { enum mem_access_type access_type[] = {CONST_ACC_R}; enum cache_request_status request_status[] = {MISS}; unsigned num_access_type = @@ -455,10 +782,10 @@ class power_stat_t { access_type, num_access_type, request_status, num_request_status)); } - unsigned get_constant_c_hits() { + double get_constant_c_hits() { return (get_constant_c_accesses() - get_constant_c_misses()); } - unsigned get_texture_c_accesses() { + double get_texture_c_accesses() { enum mem_access_type access_type[] = {TEXTURE_ACC_R}; enum cache_request_status request_status[] = {HIT, MISS, HIT_RESERVED}; unsigned num_access_type = @@ -473,7 +800,7 @@ class power_stat_t { access_type, num_access_type, request_status, num_request_status)); } - unsigned get_texture_c_misses() { + double get_texture_c_misses() { enum mem_access_type access_type[] = {TEXTURE_ACC_R}; enum cache_request_status request_status[] = {MISS}; unsigned num_access_type = @@ -488,205 +815,257 @@ class power_stat_t { access_type, num_access_type, request_status, num_request_status)); } - unsigned get_texture_c_hits() { + double get_texture_c_hits() { return (get_texture_c_accesses() - get_texture_c_misses()); } - unsigned get_inst_c_accesses() { + double get_inst_c_accesses(bool aggregate_stat) { enum mem_access_type access_type[] = {INST_ACC_R}; enum cache_request_status request_status[] = {HIT, MISS, HIT_RESERVED}; unsigned num_access_type = sizeof(access_type) / sizeof(enum mem_access_type); unsigned num_request_status = sizeof(request_status) / sizeof(enum cache_request_status); - - return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( - access_type, num_access_type, request_status, - num_request_status)) - - (pwr_mem_stat->core_cache_stats[PREV_STAT_IDX].get_stats( - access_type, num_access_type, request_status, - num_request_status)); - } - unsigned get_inst_c_misses() { + if (aggregate_stat) + return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( + access_type, num_access_type, request_status, 
num_request_status)); + else + return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( + access_type, num_access_type, request_status, + num_request_status)) - + (pwr_mem_stat->core_cache_stats[PREV_STAT_IDX].get_stats( + access_type, num_access_type, request_status, + num_request_status)); + } + double get_inst_c_misses(bool aggregate_stat) { enum mem_access_type access_type[] = {INST_ACC_R}; enum cache_request_status request_status[] = {MISS}; unsigned num_access_type = sizeof(access_type) / sizeof(enum mem_access_type); unsigned num_request_status = sizeof(request_status) / sizeof(enum cache_request_status); - - return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( - access_type, num_access_type, request_status, - num_request_status)) - - (pwr_mem_stat->core_cache_stats[PREV_STAT_IDX].get_stats( - access_type, num_access_type, request_status, - num_request_status)); - } - unsigned get_inst_c_hits() { - return (get_inst_c_accesses() - get_inst_c_misses()); + if (aggregate_stat) + return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( + access_type, num_access_type, request_status, num_request_status)); + else + return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( + access_type, num_access_type, request_status, + num_request_status)) - + (pwr_mem_stat->core_cache_stats[PREV_STAT_IDX].get_stats( + access_type, num_access_type, request_status, + num_request_status)); + } + double get_inst_c_hits(bool aggregate_stat) { + return (get_inst_c_accesses(aggregate_stat) - + get_inst_c_misses(aggregate_stat)); } - unsigned get_l1d_read_accesses() { + double get_l1d_read_accesses(bool aggregate_stat) { enum mem_access_type access_type[] = {GLOBAL_ACC_R, LOCAL_ACC_R}; - enum cache_request_status request_status[] = {HIT, MISS, HIT_RESERVED}; + enum cache_request_status request_status[] = {HIT, MISS, SECTOR_MISS}; unsigned num_access_type = sizeof(access_type) / sizeof(enum mem_access_type); unsigned num_request_status = sizeof(request_status) / sizeof(enum cache_request_status); - return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( - access_type, num_access_type, request_status, - num_request_status)) - - (pwr_mem_stat->core_cache_stats[PREV_STAT_IDX].get_stats( - access_type, num_access_type, request_status, - num_request_status)); + if (aggregate_stat) { + return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( + access_type, num_access_type, request_status, num_request_status)); + } else { + return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( + access_type, num_access_type, request_status, + num_request_status)) - + (pwr_mem_stat->core_cache_stats[PREV_STAT_IDX].get_stats( + access_type, num_access_type, request_status, + num_request_status)); + } + } + double get_l1d_read_misses(bool aggregate_stat) { + return (get_l1d_read_accesses(aggregate_stat) - + get_l1d_read_hits(aggregate_stat)); } - unsigned get_l1d_read_misses() { + double get_l1d_read_hits(bool aggregate_stat) { enum mem_access_type access_type[] = {GLOBAL_ACC_R, LOCAL_ACC_R}; - enum cache_request_status request_status[] = {MISS}; + enum cache_request_status request_status[] = {HIT, MSHR_HIT}; unsigned num_access_type = sizeof(access_type) / sizeof(enum mem_access_type); unsigned num_request_status = sizeof(request_status) / sizeof(enum cache_request_status); - return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( - access_type, num_access_type, request_status, - num_request_status)) - - 
(pwr_mem_stat->core_cache_stats[PREV_STAT_IDX].get_stats( - access_type, num_access_type, request_status, - num_request_status)); - } - unsigned get_l1d_read_hits() { - return (get_l1d_read_accesses() - get_l1d_read_misses()); + if (aggregate_stat) { + return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( + access_type, num_access_type, request_status, num_request_status)); + } else { + return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( + access_type, num_access_type, request_status, + num_request_status)) - + (pwr_mem_stat->core_cache_stats[PREV_STAT_IDX].get_stats( + access_type, num_access_type, request_status, + num_request_status)); + } } - unsigned get_l1d_write_accesses() { + double get_l1d_write_accesses(bool aggregate_stat) { enum mem_access_type access_type[] = {GLOBAL_ACC_W, LOCAL_ACC_W}; - enum cache_request_status request_status[] = {HIT, MISS, HIT_RESERVED}; + enum cache_request_status request_status[] = {HIT, MISS, SECTOR_MISS}; unsigned num_access_type = sizeof(access_type) / sizeof(enum mem_access_type); unsigned num_request_status = sizeof(request_status) / sizeof(enum cache_request_status); - return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( - access_type, num_access_type, request_status, - num_request_status)) - - (pwr_mem_stat->core_cache_stats[PREV_STAT_IDX].get_stats( - access_type, num_access_type, request_status, - num_request_status)); + if (aggregate_stat) { + return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( + access_type, num_access_type, request_status, num_request_status)); + } else { + return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( + access_type, num_access_type, request_status, + num_request_status)) - + (pwr_mem_stat->core_cache_stats[PREV_STAT_IDX].get_stats( + access_type, num_access_type, request_status, + num_request_status)); + } + } + double get_l1d_write_misses(bool aggregate_stat) { + return (get_l1d_write_accesses(aggregate_stat) - + get_l1d_write_hits(aggregate_stat)); } - unsigned get_l1d_write_misses() { + double get_l1d_write_hits(bool aggregate_stat) { enum mem_access_type access_type[] = {GLOBAL_ACC_W, LOCAL_ACC_W}; - enum cache_request_status request_status[] = {MISS}; + enum cache_request_status request_status[] = {HIT, MSHR_HIT}; unsigned num_access_type = sizeof(access_type) / sizeof(enum mem_access_type); unsigned num_request_status = sizeof(request_status) / sizeof(enum cache_request_status); - return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( - access_type, num_access_type, request_status, - num_request_status)) - - (pwr_mem_stat->core_cache_stats[PREV_STAT_IDX].get_stats( - access_type, num_access_type, request_status, - num_request_status)); - } - unsigned get_l1d_write_hits() { - return (get_l1d_write_accesses() - get_l1d_write_misses()); + if (aggregate_stat) { + return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( + access_type, num_access_type, request_status, num_request_status)); + } else { + return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( + access_type, num_access_type, request_status, + num_request_status)) - + (pwr_mem_stat->core_cache_stats[PREV_STAT_IDX].get_stats( + access_type, num_access_type, request_status, + num_request_status)); + } } - unsigned get_cache_misses() { - return get_l1d_read_misses() + get_constant_c_misses() + - get_l1d_write_misses() + get_texture_c_misses(); + double get_cache_misses() { + return get_l1d_read_misses(0) + get_constant_c_misses() + + 
get_l1d_write_misses(0) + get_texture_c_misses(); } - unsigned get_cache_read_misses() { - return get_l1d_read_misses() + get_constant_c_misses() + + double get_cache_read_misses() { + return get_l1d_read_misses(0) + get_constant_c_misses() + get_texture_c_misses(); } - unsigned get_cache_write_misses() { return get_l1d_write_misses(); } + double get_cache_write_misses() { return get_l1d_write_misses(0); } - unsigned get_shmem_read_access() { + double get_shmem_access(bool aggregate_stat) { unsigned total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += (pwr_mem_stat->shmem_read_access[CURRENT_STAT_IDX][i]) - - (pwr_mem_stat->shmem_read_access[PREV_STAT_IDX][i]); + if (aggregate_stat) + total_inst += (pwr_mem_stat->shmem_access[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_mem_stat->shmem_access[CURRENT_STAT_IDX][i]) - + (pwr_mem_stat->shmem_access[PREV_STAT_IDX][i]); } return total_inst; } - unsigned get_l2_read_accesses() { + unsigned long long get_l2_read_accesses(bool aggregate_stat) { enum mem_access_type access_type[] = { GLOBAL_ACC_R, LOCAL_ACC_R, CONST_ACC_R, TEXTURE_ACC_R, INST_ACC_R}; - enum cache_request_status request_status[] = {HIT, MISS, HIT_RESERVED}; + enum cache_request_status request_status[] = {HIT, HIT_RESERVED, MISS, + SECTOR_MISS}; unsigned num_access_type = sizeof(access_type) / sizeof(enum mem_access_type); unsigned num_request_status = sizeof(request_status) / sizeof(enum cache_request_status); + if (aggregate_stat) { + return (pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX].get_stats( + access_type, num_access_type, request_status, num_request_status)); + } else { + return (pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX].get_stats( + access_type, num_access_type, request_status, + num_request_status)) - + (pwr_mem_stat->l2_cache_stats[PREV_STAT_IDX].get_stats( + access_type, num_access_type, request_status, + num_request_status)); + } + } - return (pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX].get_stats( - access_type, num_access_type, request_status, - num_request_status)) - - (pwr_mem_stat->l2_cache_stats[PREV_STAT_IDX].get_stats( - access_type, num_access_type, request_status, - num_request_status)); + unsigned long long get_l2_read_misses(bool aggregate_stat) { + return (get_l2_read_accesses(aggregate_stat) - + get_l2_read_hits(aggregate_stat)); } - unsigned get_l2_read_misses() { + unsigned long long get_l2_read_hits(bool aggregate_stat) { enum mem_access_type access_type[] = { GLOBAL_ACC_R, LOCAL_ACC_R, CONST_ACC_R, TEXTURE_ACC_R, INST_ACC_R}; - enum cache_request_status request_status[] = {MISS}; + enum cache_request_status request_status[] = {HIT, HIT_RESERVED}; unsigned num_access_type = sizeof(access_type) / sizeof(enum mem_access_type); unsigned num_request_status = sizeof(request_status) / sizeof(enum cache_request_status); - - return (pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX].get_stats( - access_type, num_access_type, request_status, - num_request_status)) - - (pwr_mem_stat->l2_cache_stats[PREV_STAT_IDX].get_stats( - access_type, num_access_type, request_status, - num_request_status)); - } - - unsigned get_l2_read_hits() { - return (get_l2_read_accesses() - get_l2_read_misses()); + if (aggregate_stat) { + return (pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX].get_stats( + access_type, num_access_type, request_status, num_request_status)); + } else { + return (pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX].get_stats( + access_type, num_access_type, request_status, + num_request_status)) - + 
(pwr_mem_stat->l2_cache_stats[PREV_STAT_IDX].get_stats( + access_type, num_access_type, request_status, + num_request_status)); + } } - unsigned get_l2_write_accesses() { + unsigned long long get_l2_write_accesses(bool aggregate_stat) { enum mem_access_type access_type[] = {GLOBAL_ACC_W, LOCAL_ACC_W, L1_WRBK_ACC}; - enum cache_request_status request_status[] = {HIT, MISS, HIT_RESERVED}; + enum cache_request_status request_status[] = {HIT, HIT_RESERVED, MISS, + SECTOR_MISS}; unsigned num_access_type = sizeof(access_type) / sizeof(enum mem_access_type); unsigned num_request_status = sizeof(request_status) / sizeof(enum cache_request_status); - - return (pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX].get_stats( - access_type, num_access_type, request_status, - num_request_status)) - - (pwr_mem_stat->l2_cache_stats[PREV_STAT_IDX].get_stats( - access_type, num_access_type, request_status, - num_request_status)); + if (aggregate_stat) { + return (pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX].get_stats( + access_type, num_access_type, request_status, num_request_status)); + } else { + return (pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX].get_stats( + access_type, num_access_type, request_status, + num_request_status)) - + (pwr_mem_stat->l2_cache_stats[PREV_STAT_IDX].get_stats( + access_type, num_access_type, request_status, + num_request_status)); + } } - unsigned get_l2_write_misses() { + unsigned long long get_l2_write_misses(bool aggregate_stat) { + return (get_l2_write_accesses(aggregate_stat) - + get_l2_write_hits(aggregate_stat)); + } + unsigned long long get_l2_write_hits(bool aggregate_stat) { enum mem_access_type access_type[] = {GLOBAL_ACC_W, LOCAL_ACC_W, L1_WRBK_ACC}; - enum cache_request_status request_status[] = {MISS}; + enum cache_request_status request_status[] = {HIT, HIT_RESERVED}; unsigned num_access_type = sizeof(access_type) / sizeof(enum mem_access_type); unsigned num_request_status = sizeof(request_status) / sizeof(enum cache_request_status); - - return (pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX].get_stats( - access_type, num_access_type, request_status, - num_request_status)) - - (pwr_mem_stat->l2_cache_stats[PREV_STAT_IDX].get_stats( - access_type, num_access_type, request_status, - num_request_status)); - } - unsigned get_l2_write_hits() { - return (get_l2_write_accesses() - get_l2_write_misses()); + if (aggregate_stat) { + return (pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX].get_stats( + access_type, num_access_type, request_status, num_request_status)); + } else { + return (pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX].get_stats( + access_type, num_access_type, request_status, + num_request_status)) - + (pwr_mem_stat->l2_cache_stats[PREV_STAT_IDX].get_stats( + access_type, num_access_type, request_status, + num_request_status)); + } } - unsigned get_dram_cmd() { + double get_dram_cmd() { unsigned total = 0; for (unsigned i = 0; i < m_mem_config->m_n_mem; ++i) { total += (pwr_mem_stat->n_cmd[CURRENT_STAT_IDX][i] - @@ -694,7 +1073,7 @@ class power_stat_t { } return total; } - unsigned get_dram_activity() { + double get_dram_activity() { unsigned total = 0; for (unsigned i = 0; i < m_mem_config->m_n_mem; ++i) { total += (pwr_mem_stat->n_activity[CURRENT_STAT_IDX][i] - @@ -702,7 +1081,7 @@ class power_stat_t { } return total; } - unsigned get_dram_nop() { + double get_dram_nop() { unsigned total = 0; for (unsigned i = 0; i < m_mem_config->m_n_mem; ++i) { total += (pwr_mem_stat->n_nop[CURRENT_STAT_IDX][i] - @@ -710,7 +1089,7 @@ class power_stat_t { } return total; 
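The DRAM-side getters below are widened the same way, and get_dram_wr now folds the new n_wr_WB write-back counter into the write total. A hedged sketch of that accounting, with the free-function name and two-slot parameter shapes invented for illustration (index 0 = PREV, index 1 = CURRENT in this sketch):

// Illustrative: per-channel DRAM write activity including write-backs,
// windowed the same way as the core-side counters.
double dram_writes(const unsigned *const n_wr[2],
                   const unsigned *const n_wr_WB[2], unsigned n_mem,
                   bool aggregate_stat) {
  unsigned total = 0;  // accumulated as unsigned, returned as double,
                       // matching the getters in this patch
  for (unsigned i = 0; i < n_mem; ++i) {
    if (aggregate_stat)
      total += n_wr[1][i] + n_wr_WB[1][i];
    else
      total += (n_wr[1][i] - n_wr[0][i]) + (n_wr_WB[1][i] - n_wr_WB[0][i]);
  }
  return total;
}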
} - unsigned get_dram_act() { + double get_dram_act() { unsigned total = 0; for (unsigned i = 0; i < m_mem_config->m_n_mem; ++i) { total += (pwr_mem_stat->n_act[CURRENT_STAT_IDX][i] - @@ -718,31 +1097,46 @@ class power_stat_t { } return total; } - unsigned get_dram_pre() { + double get_dram_pre(bool aggregate_stat) { unsigned total = 0; for (unsigned i = 0; i < m_mem_config->m_n_mem; ++i) { - total += (pwr_mem_stat->n_pre[CURRENT_STAT_IDX][i] - - pwr_mem_stat->n_pre[PREV_STAT_IDX][i]); + if (aggregate_stat) { + total += pwr_mem_stat->n_pre[CURRENT_STAT_IDX][i]; + } else { + total += (pwr_mem_stat->n_pre[CURRENT_STAT_IDX][i] - + pwr_mem_stat->n_pre[PREV_STAT_IDX][i]); + } } return total; } - unsigned get_dram_rd() { + double get_dram_rd(bool aggregate_stat) { unsigned total = 0; for (unsigned i = 0; i < m_mem_config->m_n_mem; ++i) { - total += (pwr_mem_stat->n_rd[CURRENT_STAT_IDX][i] - - pwr_mem_stat->n_rd[PREV_STAT_IDX][i]); + if (aggregate_stat) { + total += pwr_mem_stat->n_rd[CURRENT_STAT_IDX][i]; + } else { + total += (pwr_mem_stat->n_rd[CURRENT_STAT_IDX][i] - + pwr_mem_stat->n_rd[PREV_STAT_IDX][i]); + } } return total; } - unsigned get_dram_wr() { + double get_dram_wr(bool aggregate_stat) { unsigned total = 0; for (unsigned i = 0; i < m_mem_config->m_n_mem; ++i) { - total += (pwr_mem_stat->n_wr[CURRENT_STAT_IDX][i] - - pwr_mem_stat->n_wr[PREV_STAT_IDX][i]); + if (aggregate_stat) { + total += pwr_mem_stat->n_wr[CURRENT_STAT_IDX][i] + + pwr_mem_stat->n_wr_WB[CURRENT_STAT_IDX][i]; + } else { + total += (pwr_mem_stat->n_wr[CURRENT_STAT_IDX][i] - + pwr_mem_stat->n_wr[PREV_STAT_IDX][i]) + + (pwr_mem_stat->n_wr_WB[CURRENT_STAT_IDX][i] - + pwr_mem_stat->n_wr_WB[PREV_STAT_IDX][i]); + } } return total; } - unsigned get_dram_req() { + double get_dram_req() { unsigned total = 0; for (unsigned i = 0; i < m_mem_config->m_n_mem; ++i) { total += (pwr_mem_stat->n_req[CURRENT_STAT_IDX][i] - @@ -751,20 +1145,30 @@ class power_stat_t { return total; } - long get_icnt_simt_to_mem() { + unsigned long long get_icnt_simt_to_mem(bool aggregate_stat) { long total = 0; for (unsigned i = 0; i < m_config->n_simt_clusters; ++i) { - total += (pwr_mem_stat->n_simt_to_mem[CURRENT_STAT_IDX][i] - - pwr_mem_stat->n_simt_to_mem[PREV_STAT_IDX][i]); + if (aggregate_stat) { + total += pwr_mem_stat->n_simt_to_mem[CURRENT_STAT_IDX][i]; + } else { + total += (pwr_mem_stat->n_simt_to_mem[CURRENT_STAT_IDX][i] - + pwr_mem_stat->n_simt_to_mem[PREV_STAT_IDX][i]); + } } return total; } - long get_icnt_mem_to_simt() { + unsigned long long get_icnt_mem_to_simt(bool aggregate_stat) { long total = 0; for (unsigned i = 0; i < m_config->n_simt_clusters; ++i) { - total += (pwr_mem_stat->n_mem_to_simt[CURRENT_STAT_IDX][i] - - pwr_mem_stat->n_mem_to_simt[PREV_STAT_IDX][i]); + if (aggregate_stat) { + total += pwr_mem_stat->n_mem_to_simt[CURRENT_STAT_IDX][i]; + } + + else { + total += (pwr_mem_stat->n_mem_to_simt[CURRENT_STAT_IDX][i] - + pwr_mem_stat->n_mem_to_simt[PREV_STAT_IDX][i]); + } } return total; } diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index c6e7b8f67..7482e0ef9 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -1,19 +1,22 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Wilson W.L. Fung, Ali Bakhoda, -// George L. Yuan, Andrew Turner, Inderpreet Singh -// The University of British Columbia -// All rights reserved. +// Copyright (c) 2009-2021, Tor M. Aamodt, Wilson W.L. Fung, Ali Bakhoda, +// George L. 
Yuan, Andrew Turner, Inderpreet Singh, Vijay Kandiah, Nikos +// Hardavellas, Mahmoud Khairy, Junrui Pan, Timothy G. Rogers The University of +// British Columbia, Northwestern University, Purdue University All rights +// reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, +// this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -54,11 +57,25 @@ mem_fetch *shader_core_mem_fetch_allocator::alloc( new_addr_type addr, mem_access_type type, unsigned size, bool wr, - unsigned long long cycle) const { + unsigned long long cycle, unsigned long long streamID) const { mem_access_t access(type, addr, size, wr, m_memory_config->gpgpu_ctx); - mem_fetch *mf = - new mem_fetch(access, NULL, wr ? WRITE_PACKET_SIZE : READ_PACKET_SIZE, -1, - m_core_id, m_cluster_id, m_memory_config, cycle); + mem_fetch *mf = new mem_fetch( + access, NULL, streamID, wr ? WRITE_PACKET_SIZE : READ_PACKET_SIZE, -1, + m_core_id, m_cluster_id, m_memory_config, cycle); + return mf; +} + +mem_fetch *shader_core_mem_fetch_allocator::alloc( + new_addr_type addr, mem_access_type type, const active_mask_t &active_mask, + const mem_access_byte_mask_t &byte_mask, + const mem_access_sector_mask_t §or_mask, unsigned size, bool wr, + unsigned long long cycle, unsigned wid, unsigned sid, unsigned tpc, + mem_fetch *original_mf, unsigned long long streamID) const { + mem_access_t access(type, addr, size, wr, active_mask, byte_mask, sector_mask, + m_memory_config->gpgpu_ctx); + mem_fetch *mf = new mem_fetch( + access, NULL, streamID, wr ? 
WRITE_PACKET_SIZE : READ_PACKET_SIZE, wid, + m_core_id, m_cluster_id, m_memory_config, cycle, original_mf); return mf; } ///////////////////////////////////////////////////////////////////////////// @@ -91,7 +108,7 @@ void shader_core_ctx::create_front_pipeline() { m_pipeline_reg.push_back( register_set(m_config->pipe_widths[j], pipeline_stage_name_decode[j])); } - for (int j = 0; j < m_config->m_specialized_unit.size(); j++) { + for (unsigned j = 0; j < m_config->m_specialized_unit.size(); j++) { m_pipeline_reg.push_back( register_set(m_config->m_specialized_unit[j].id_oc_spec_reg_width, m_config->m_specialized_unit[j].name)); @@ -99,7 +116,7 @@ void shader_core_ctx::create_front_pipeline() { m_specilized_dispatch_reg.push_back( &m_pipeline_reg[m_pipeline_reg.size() - 1]); } - for (int j = 0; j < m_config->m_specialized_unit.size(); j++) { + for (unsigned j = 0; j < m_config->m_specialized_unit.size(); j++) { m_pipeline_reg.push_back( register_set(m_config->m_specialized_unit[j].oc_ex_spec_reg_width, m_config->m_specialized_unit[j].name)); @@ -108,7 +125,7 @@ void shader_core_ctx::create_front_pipeline() { if (m_config->sub_core_model) { // in subcore model, each scheduler should has its own issue register, so - // num scheduler = reg width + // ensure num scheduler = reg width assert(m_config->gpgpu_num_sched_per_core == m_pipeline_reg[ID_OC_SP].get_size()); assert(m_config->gpgpu_num_sched_per_core == @@ -124,6 +141,11 @@ void shader_core_ctx::create_front_pipeline() { if (m_config->gpgpu_num_int_units > 0) assert(m_config->gpgpu_num_sched_per_core == m_pipeline_reg[ID_OC_INT].get_size()); + for (unsigned j = 0; j < m_config->m_specialized_unit.size(); j++) { + if (m_config->m_specialized_unit[j].num_units > 0) + assert(m_config->gpgpu_num_sched_per_core == + m_config->m_specialized_unit[j].id_oc_spec_reg_width); + } } m_threadState = (thread_ctx_t *)calloc(sizeof(thread_ctx_t), @@ -140,7 +162,10 @@ void shader_core_ctx::create_front_pipeline() { } // m_icnt = new shader_memory_interface(this,cluster); - if (m_config->gpgpu_perfect_mem) { + if (m_memory_config->SST_mode) { + m_icnt = new sst_memory_interface( + this, static_cast(m_cluster)); + } else if (m_config->gpgpu_perfect_mem) { m_icnt = new perfect_memory_interface(this, m_cluster); } else { m_icnt = new shader_memory_interface(this, m_cluster); @@ -156,7 +181,7 @@ void shader_core_ctx::create_front_pipeline() { snprintf(name, STRSIZE, "L1I_%03d", m_sid); m_L1I = new read_only_cache(name, m_config->m_L1I_config, m_sid, get_shader_instruction_cache_id(), m_icnt, - IN_L1I_MISS_QUEUE); + IN_L1I_MISS_QUEUE, OTHER_GPU_CACHE, m_gpu); } void shader_core_ctx::create_schedulers() { @@ -166,18 +191,16 @@ void shader_core_ctx::create_schedulers() { // must currently occur after all inputs have been initialized. std::string sched_config = m_config->gpgpu_scheduler_string; const concrete_scheduler scheduler = - sched_config.find("lrr") != std::string::npos - ? CONCRETE_SCHEDULER_LRR - : sched_config.find("two_level_active") != std::string::npos - ? CONCRETE_SCHEDULER_TWO_LEVEL_ACTIVE - : sched_config.find("gto") != std::string::npos - ? CONCRETE_SCHEDULER_GTO - : sched_config.find("old") != std::string::npos - ? CONCRETE_SCHEDULER_OLDEST_FIRST - : sched_config.find("warp_limiting") != - std::string::npos - ? CONCRETE_SCHEDULER_WARP_LIMITING - : NUM_CONCRETE_SCHEDULERS; + sched_config.find("lrr") != std::string::npos ? CONCRETE_SCHEDULER_LRR + : sched_config.find("two_level_active") != std::string::npos + ? 
CONCRETE_SCHEDULER_TWO_LEVEL_ACTIVE + : sched_config.find("gto") != std::string::npos ? CONCRETE_SCHEDULER_GTO + : sched_config.find("rrr") != std::string::npos ? CONCRETE_SCHEDULER_RRR + : sched_config.find("old") != std::string::npos + ? CONCRETE_SCHEDULER_OLDEST_FIRST + : sched_config.find("warp_limiting") != std::string::npos + ? CONCRETE_SCHEDULER_WARP_LIMITING + : NUM_CONCRETE_SCHEDULERS; assert(scheduler != NUM_CONCRETE_SCHEDULERS); for (unsigned i = 0; i < m_config->gpgpu_num_sched_per_core; i++) { @@ -206,6 +229,14 @@ void shader_core_ctx::create_schedulers() { &m_pipeline_reg[ID_OC_TENSOR_CORE], m_specilized_dispatch_reg, &m_pipeline_reg[ID_OC_MEM], i)); break; + case CONCRETE_SCHEDULER_RRR: + schedulers.push_back(new rrr_scheduler( + m_stats, this, m_scoreboard, m_simt_stack, &m_warp, + &m_pipeline_reg[ID_OC_SP], &m_pipeline_reg[ID_OC_DP], + &m_pipeline_reg[ID_OC_SFU], &m_pipeline_reg[ID_OC_INT], + &m_pipeline_reg[ID_OC_TENSOR_CORE], m_specilized_dispatch_reg, + &m_pipeline_reg[ID_OC_MEM], i)); + break; case CONCRETE_SCHEDULER_OLDEST_FIRST: schedulers.push_back(new oldest_scheduler( m_stats, this, m_scoreboard, m_simt_stack, &m_warp, @@ -377,41 +408,41 @@ void shader_core_ctx::create_exec_pipeline() { // m_fu = new simd_function_unit*[m_num_function_units]; - for (int k = 0; k < m_config->gpgpu_num_sp_units; k++) { - m_fu.push_back(new sp_unit(&m_pipeline_reg[EX_WB], m_config, this)); + for (unsigned k = 0; k < m_config->gpgpu_num_sp_units; k++) { + m_fu.push_back(new sp_unit(&m_pipeline_reg[EX_WB], m_config, this, k)); m_dispatch_port.push_back(ID_OC_SP); m_issue_port.push_back(OC_EX_SP); } - for (int k = 0; k < m_config->gpgpu_num_dp_units; k++) { - m_fu.push_back(new dp_unit(&m_pipeline_reg[EX_WB], m_config, this)); + for (unsigned k = 0; k < m_config->gpgpu_num_dp_units; k++) { + m_fu.push_back(new dp_unit(&m_pipeline_reg[EX_WB], m_config, this, k)); m_dispatch_port.push_back(ID_OC_DP); m_issue_port.push_back(OC_EX_DP); } - for (int k = 0; k < m_config->gpgpu_num_int_units; k++) { - m_fu.push_back(new int_unit(&m_pipeline_reg[EX_WB], m_config, this)); + for (unsigned k = 0; k < m_config->gpgpu_num_int_units; k++) { + m_fu.push_back(new int_unit(&m_pipeline_reg[EX_WB], m_config, this, k)); m_dispatch_port.push_back(ID_OC_INT); m_issue_port.push_back(OC_EX_INT); } - for (int k = 0; k < m_config->gpgpu_num_sfu_units; k++) { - m_fu.push_back(new sfu(&m_pipeline_reg[EX_WB], m_config, this)); + for (unsigned k = 0; k < m_config->gpgpu_num_sfu_units; k++) { + m_fu.push_back(new sfu(&m_pipeline_reg[EX_WB], m_config, this, k)); m_dispatch_port.push_back(ID_OC_SFU); m_issue_port.push_back(OC_EX_SFU); } - for (int k = 0; k < m_config->gpgpu_num_tensor_core_units; k++) { - m_fu.push_back(new tensor_core(&m_pipeline_reg[EX_WB], m_config, this)); + for (unsigned k = 0; k < m_config->gpgpu_num_tensor_core_units; k++) { + m_fu.push_back(new tensor_core(&m_pipeline_reg[EX_WB], m_config, this, k)); m_dispatch_port.push_back(ID_OC_TENSOR_CORE); m_issue_port.push_back(OC_EX_TENSOR_CORE); } - for (int j = 0; j < m_config->m_specialized_unit.size(); j++) { + for (unsigned j = 0; j < m_config->m_specialized_unit.size(); j++) { for (unsigned k = 0; k < m_config->m_specialized_unit[j].num_units; k++) { m_fu.push_back(new specialized_unit( &m_pipeline_reg[EX_WB], m_config, this, SPEC_UNIT_START_ID + j, m_config->m_specialized_unit[j].name, - m_config->m_specialized_unit[j].latency)); + m_config->m_specialized_unit[j].latency, k)); 
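The hunks above thread a per-unit issue-register index (`k`) through every functional-unit constructor: under the sub-core model, each warp scheduler owns exactly one slot of each `register_set`, and a unit may only pull instructions from the slot of the scheduler it belongs to. Below is a minimal sketch of that slot-partitioning idea; `Slot` and `SimpleRegisterSet` are illustrative stand-ins, not the simulator's real types.

```cpp
#include <cassert>
#include <cstdio>
#include <vector>

// Illustrative stand-in for one entry of a pipeline register set.
struct Slot {
  bool valid = false;
  int warp_id = -1;
};

class SimpleRegisterSet {
 public:
  explicit SimpleRegisterSet(unsigned width) : m_regs(width) {}

  // A scheduler writes only into its own slot when partitioning is on.
  void move_in(unsigned reg_id, int warp_id) {
    assert(reg_id < m_regs.size() && !m_regs[reg_id].valid);
    m_regs[reg_id].valid = true;
    m_regs[reg_id].warp_id = warp_id;
  }

  // With sub-core partitioning, scheduler `reg_id` may only pick up its own
  // slot; without it, any occupied slot is fair game.
  Slot *get_ready(bool sub_core_model, unsigned reg_id) {
    if (sub_core_model)
      return m_regs[reg_id].valid ? &m_regs[reg_id] : nullptr;
    for (auto &r : m_regs)
      if (r.valid) return &r;
    return nullptr;
  }

 private:
  std::vector<Slot> m_regs;  // one slot per scheduler when partitioned
};

int main() {
  SimpleRegisterSet id_oc_sp(4);       // e.g. four schedulers per core
  id_oc_sp.move_in(2, /*warp_id=*/7);  // scheduler 2 latches a warp
  std::printf("sched 1 sees: %s\n",
              id_oc_sp.get_ready(true, 1) ? "ready" : "nothing");
  std::printf("sched 2 sees: %s\n",
              id_oc_sp.get_ready(true, 2) ? "ready" : "nothing");
}
```

This also motivates the asserts added above: with one slot per scheduler, each issue register's width must equal `gpgpu_num_sched_per_core`.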
m_dispatch_port.push_back(m_config->m_specialized_unit[j].ID_OC_SPEC_ID); m_issue_port.push_back(m_config->m_specialized_unit[j].OC_EX_SPEC_ID); } @@ -419,7 +450,7 @@ void shader_core_ctx::create_exec_pipeline() { m_ldst_unit = new ldst_unit(m_icnt, m_mem_fetch_allocator, this, &m_operand_collector, m_scoreboard, m_config, - m_memory_config, m_stats, m_sid, m_tpc); + m_memory_config, m_stats, m_sid, m_tpc, m_gpu); m_fu.push_back(m_ldst_unit); m_dispatch_port.push_back(ID_OC_MEM); m_issue_port.push_back(OC_EX_MEM); @@ -450,12 +481,16 @@ shader_core_ctx::shader_core_ctx(class gpgpu_sim *gpu, m_config = config; m_memory_config = mem_config; m_stats = stats; - unsigned warp_size = config->warp_size; + // unsigned warp_size = config->warp_size; Issue_Prio = 0; m_sid = shader_id; m_tpc = tpc_id; + if (get_gpu()->get_config().g_power_simulation_enabled) { + scaling_coeffs = get_gpu()->get_scaling_coeffs(); + } + m_last_inst_gpu_sim_cycle = 0; m_last_inst_gpu_tot_sim_cycle = 0; @@ -497,7 +532,6 @@ void shader_core_ctx::reinit(unsigned start_thread, unsigned end_thread, void shader_core_ctx::init_warps(unsigned cta_id, unsigned start_thread, unsigned end_thread, unsigned ctaid, int cta_size, kernel_info_t &kernel) { - // address_type start_pc = next_pc(start_thread); unsigned kernel_id = kernel.get_uid(); if (m_config->model == POST_DOMINATOR) { @@ -536,7 +570,8 @@ void shader_core_ctx::init_warps(unsigned cta_id, unsigned start_thread, start_pc = pc; } - m_warp[i]->init(start_pc, cta_id, i, active_threads, m_dynamic_warp_id); + m_warp[i]->init(start_pc, cta_id, i, active_threads, m_dynamic_warp_id, + kernel.get_streamID()); ++m_dynamic_warp_id; m_not_completed += n_active; ++m_active_warps; @@ -607,7 +642,8 @@ void shader_core_stats::print(FILE *fout) const { fprintf(fout, "gpgpu_n_param_mem_insn = %d\n", gpgpu_n_param_insn); fprintf(fout, "gpgpu_n_shmem_bkconflict = %d\n", gpgpu_n_shmem_bkconflict); - fprintf(fout, "gpgpu_n_cache_bkconflict = %d\n", gpgpu_n_cache_bkconflict); + fprintf(fout, "gpgpu_n_l1cache_bkconflict = %d\n", + gpgpu_n_l1cache_bkconflict); fprintf(fout, "gpgpu_n_intrawarp_mshr_merge = %d\n", gpgpu_n_intrawarp_mshr_merge); @@ -805,8 +841,8 @@ void shader_core_stats::visualizer_print(gzFile visualizer_file) { gzprintf(visualizer_file, "\n"); // overall cache miss rates - gzprintf(visualizer_file, "gpgpu_n_cache_bkconflict: %d\n", - gpgpu_n_cache_bkconflict); + gzprintf(visualizer_file, "gpgpu_n_l1cache_bkconflict: %d\n", + gpgpu_n_l1cache_bkconflict); gzprintf(visualizer_file, "gpgpu_n_shmem_bkconflict: %d\n", gpgpu_n_shmem_bkconflict); @@ -859,7 +895,9 @@ void shader_core_ctx::decode() { m_warp[m_inst_fetch_buffer.m_warp_id]->inc_inst_in_pipeline(); if (pI1) { m_stats->m_num_decoded_insn[m_sid]++; - if (pI1->oprnd_type == INT_OP) { + if ((pI1->oprnd_type == INT_OP) || + (pI1->oprnd_type == UN_OP)) { // these counters get added up in mcPat + // to compute scheduler power m_stats->m_num_INTdecoded_insn[m_sid]++; } else if (pI1->oprnd_type == FP_OP) { m_stats->m_num_FPdecoded_insn[m_sid]++; @@ -870,7 +908,9 @@ void shader_core_ctx::decode() { m_warp[m_inst_fetch_buffer.m_warp_id]->ibuffer_fill(1, pI2); m_warp[m_inst_fetch_buffer.m_warp_id]->inc_inst_in_pipeline(); m_stats->m_num_decoded_insn[m_sid]++; - if (pI2->oprnd_type == INT_OP) { + if ((pI2->oprnd_type == INT_OP) || + (pI2->oprnd_type == UN_OP)) { // these counters get added up in + // mcPat to compute scheduler power m_stats->m_num_INTdecoded_insn[m_sid]++; } else if (pI2->oprnd_type == FP_OP) {
m_stats->m_num_FPdecoded_insn[m_sid]++; @@ -916,7 +956,8 @@ void shader_core_ctx::fetch() { m_threadState[tid].m_active = false; unsigned cta_id = m_warp[warp_id]->get_cta_id(); if (m_thread[tid] == NULL) { - register_cta_thread_exit(cta_id, m_kernel); + register_cta_thread_exit(cta_id, + m_warp[warp_id]->get_kernel_info()); } else { register_cta_thread_exit(cta_id, &(m_thread[tid]->get_kernel())); @@ -948,14 +989,15 @@ void shader_core_ctx::fetch() { // mem_fetch *mf = m_mem_fetch_allocator->alloc() mem_access_t acc(INST_ACC_R, ppc, nbytes, false, m_gpu->gpgpu_ctx); mem_fetch *mf = new mem_fetch( - acc, NULL /*we don't have an instruction yet*/, READ_PACKET_SIZE, - warp_id, m_sid, m_tpc, m_memory_config, + acc, NULL, m_warp[warp_id]->get_kernel_info()->get_streamID(), + READ_PACKET_SIZE, warp_id, m_sid, m_tpc, m_memory_config, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle); std::list<cache_event> events; enum cache_request_status status; - if (m_config->perfect_inst_const_cache) + if (m_config->perfect_inst_const_cache) { status = HIT; - else + shader_cache_access_log(m_sid, INSTRUCTION, 0); + } else status = m_L1I->access( (new_addr_type)ppc, mf, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle, events); @@ -1002,13 +1044,31 @@ void shader_core_ctx::issue_warp(register_set &pipe_reg_set, m_warp[warp_id]->ibuffer_free(); assert(next_inst->valid()); **pipe_reg = *next_inst; // static instruction information - (*pipe_reg)->issue(active_mask, warp_id, - m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, - m_warp[warp_id]->get_dynamic_warp_id(), - sch_id); // dynamic instruction information + (*pipe_reg)->issue( + active_mask, warp_id, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, + m_warp[warp_id]->get_dynamic_warp_id(), sch_id, + m_warp[warp_id]->get_streamID()); // dynamic instruction information m_stats->shader_cycle_distro[2 + (*pipe_reg)->active_count()]++; func_exec_inst(**pipe_reg); + // Add LDGSTS instructions into a buffer + unsigned int ldgdepbar_id = m_warp[warp_id]->m_ldgdepbar_id; + if (next_inst->m_is_ldgsts) { + if (m_warp[warp_id]->m_ldgdepbar_buf.size() == ldgdepbar_id + 1) { + m_warp[warp_id]->m_ldgdepbar_buf[ldgdepbar_id].push_back(*next_inst); + } else { + assert(m_warp[warp_id]->m_ldgdepbar_buf.size() < ldgdepbar_id + 1); + std::vector<warp_inst_t> l; + l.push_back(*next_inst); + m_warp[warp_id]->m_ldgdepbar_buf.push_back(l); + } + // If the mask of the instruction is all 0, then the address is also 0, + // so that there's no need to check through the writeback + if (next_inst->get_active_mask() == 0) { + (m_warp[warp_id]->m_ldgdepbar_buf.back()).back().pc = -1; + } + } + if (next_inst->op == BARRIER_OP) { m_warp[warp_id]->store_info_of_last_inst_at_barrier(*pipe_reg); m_barriers.warp_reaches_barrier(m_warp[warp_id]->get_cta_id(), warp_id, @@ -1016,6 +1076,47 @@ void shader_core_ctx::issue_warp(register_set &pipe_reg_set, } else if (next_inst->op == MEMORY_BARRIER_OP) { m_warp[warp_id]->set_membar(); + } else if (next_inst->m_is_ldgdepbar) { // Add for LDGDEPBAR + m_warp[warp_id]->m_ldgdepbar_id++; + // If there are no added LDGSTS, insert an empty vector + if (m_warp[warp_id]->m_ldgdepbar_buf.size() != ldgdepbar_id + 1) { + assert(m_warp[warp_id]->m_ldgdepbar_buf.size() < ldgdepbar_id + 1); + std::vector<warp_inst_t> l; + m_warp[warp_id]->m_ldgdepbar_buf.push_back(l); + } + } else if (next_inst->m_is_depbar) { // Add for DEPBAR + // Set to true immediately when a DEPBAR instruction is met + m_warp[warp_id]->m_waiting_ldgsts = true; + m_warp[warp_id]->m_depbar_group = + next_inst->m_depbar_group_no; // set
in trace_driven.cc + + // Record the last group that's possibly being monitored by this DEPBAR + // instr + m_warp[warp_id]->m_depbar_start_id = m_warp[warp_id]->m_ldgdepbar_id - 1; + + // Record the last group that's actually being monitored by this DEPBAR + // instr + unsigned int end_group = + m_warp[warp_id]->m_ldgdepbar_id - m_warp[warp_id]->m_depbar_group; + + // Check for the case that the LDGSTSs monitored have finished when + // encountering the DEPBAR instruction + bool done_flag = true; + for (int i = 0; i < end_group; i++) { + for (int j = 0; j < m_warp[warp_id]->m_ldgdepbar_buf[i].size(); j++) { + if (m_warp[warp_id]->m_ldgdepbar_buf[i][j].pc != -1) { + done_flag = false; + goto UpdateDEPBAR; + } + } + } + + UpdateDEPBAR: + if (done_flag) { + if (m_warp[warp_id]->m_waiting_ldgsts) { + m_warp[warp_id]->m_waiting_ldgsts = false; + } + } } updateSIMTStack(warp_id, *pipe_reg); @@ -1082,6 +1183,34 @@ void scheduler_unit::order_lrr( } } +template <class T> +void scheduler_unit::order_rrr( + std::vector<T> &result_list, const typename std::vector<T> &input_list, + const typename std::vector<T>::const_iterator &last_issued_from_input, + unsigned num_warps_to_add) { + result_list.clear(); + + if (m_num_issued_last_cycle > 0 || warp(m_current_turn_warp).done_exit() || + warp(m_current_turn_warp).waiting()) { + typename std::vector<T>::const_iterator iter = + (last_issued_from_input == input_list.end()) + ? input_list.begin() + : last_issued_from_input + 1; + for (unsigned count = 0; count < num_warps_to_add; ++iter, ++count) { + if (iter == input_list.end()) { + iter = input_list.begin(); + } + unsigned warp_id = (*iter)->get_warp_id(); + if (!(*iter)->done_exit() && !(*iter)->waiting()) { + result_list.push_back(*iter); + m_current_turn_warp = warp_id; + break; + } + } + } else { + result_list.push_back(&warp(m_current_turn_warp)); + } +} /** * A general function to order things in a priority-based way. * The core usage of the function is similar to order_lrr.
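The `order_rrr` routine above passes the turn to the next eligible warp whenever the current warp issued something last cycle (or has exited or is waiting); only a warp that stalled while still viable keeps the turn and retries. A self-contained sketch of the same rotation policy, where `WarpState` and `next_rrr_warp` are illustrative names rather than the simulator's API:

```cpp
#include <cstdio>
#include <vector>

// Illustrative model of one warp's scheduling state.
struct WarpState {
  unsigned id;
  bool done_exit;  // warp has finished
  bool waiting;    // warp is stalled on a barrier/fence/LDGSTS
};

// Keep the turn on `current` only if it issued nothing last cycle yet can
// still make progress; otherwise rotate round-robin to the next live warp.
unsigned next_rrr_warp(const std::vector<WarpState> &warps, unsigned current,
                       unsigned issued_last_cycle) {
  const WarpState &cur = warps[current];
  if (issued_last_cycle == 0 && !cur.done_exit && !cur.waiting)
    return current;  // stalled but viable: give it another try
  for (size_t n = 1; n <= warps.size(); ++n) {
    const WarpState &w = warps[(current + n) % warps.size()];
    if (!w.done_exit && !w.waiting) return w.id;
  }
  return current;  // nothing eligible this cycle
}

int main() {
  std::vector<WarpState> warps = {{0, false, false},
                                  {1, false, false},
                                  {2, true, false},
                                  {3, false, true}};
  // Warp 0 issued last cycle, so the turn rotates to the next live warp,
  // which is warp 1 (warps 2 and 3 are finished/waiting).
  std::printf("next warp: %u\n", next_rrr_warp(warps, 0, 1));
}
```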
@@ -1228,29 +1357,21 @@ void scheduler_unit::cycle() { previous_issued_inst_exec_type = exec_unit_type_t::MEM; } } else { - bool sp_pipe_avail = - (m_shader->m_config->gpgpu_num_sp_units > 0) && - m_sp_out->has_free(m_shader->m_config->sub_core_model, m_id); - bool sfu_pipe_avail = - (m_shader->m_config->gpgpu_num_sfu_units > 0) && - m_sfu_out->has_free(m_shader->m_config->sub_core_model, m_id); - bool tensor_core_pipe_avail = - (m_shader->m_config->gpgpu_num_tensor_core_units > 0) && - m_tensor_core_out->has_free( - m_shader->m_config->sub_core_model, m_id); - bool dp_pipe_avail = - (m_shader->m_config->gpgpu_num_dp_units > 0) && - m_dp_out->has_free(m_shader->m_config->sub_core_model, m_id); - bool int_pipe_avail = - (m_shader->m_config->gpgpu_num_int_units > 0) && - m_int_out->has_free(m_shader->m_config->sub_core_model, m_id); - // This code needs to be refactored if (pI->op != TENSOR_CORE_OP && pI->op != SFU_OP && pI->op != DP_OP && !(pI->op >= SPEC_UNIT_START_ID)) { bool execute_on_SP = false; bool execute_on_INT = false; + bool sp_pipe_avail = + (m_shader->m_config->gpgpu_num_sp_units > 0) && + m_sp_out->has_free(m_shader->m_config->sub_core_model, + m_id); + bool int_pipe_avail = + (m_shader->m_config->gpgpu_num_int_units > 0) && + m_int_out->has_free(m_shader->m_config->sub_core_model, + m_id); + + // if INT unit pipeline exists, then execute ALU and INT // operations on INT unit and SP-FPU on SP unit (like in Volta) // if INT unit pipeline does not exist, then execute all ALU, INT @@ -1311,6 +1432,11 @@ void scheduler_unit::cycle() { (pI->op == DP_OP) && !(diff_exec_units && previous_issued_inst_exec_type == exec_unit_type_t::DP)) { + bool dp_pipe_avail = + (m_shader->m_config->gpgpu_num_dp_units > 0) && + m_dp_out->has_free(m_shader->m_config->sub_core_model, + m_id); + if (dp_pipe_avail) { m_shader->issue_warp(*m_dp_out, pI, active_mask, warp_id, m_id); @@ -1326,6 +1452,11 @@ void scheduler_unit::cycle() { (pI->op == SFU_OP) || (pI->op == ALU_SFU_OP)) && !(diff_exec_units && previous_issued_inst_exec_type == exec_unit_type_t::SFU)) { + bool sfu_pipe_avail = + (m_shader->m_config->gpgpu_num_sfu_units > 0) && + m_sfu_out->has_free(m_shader->m_config->sub_core_model, + m_id); + if (sfu_pipe_avail) { m_shader->issue_warp(*m_sfu_out, pI, active_mask, warp_id, m_id); @@ -1337,6 +1468,11 @@ void scheduler_unit::cycle() { } else if ((pI->op == TENSOR_CORE_OP) && !(diff_exec_units && previous_issued_inst_exec_type == exec_unit_type_t::TENSOR)) { + bool tensor_core_pipe_avail = + (m_shader->m_config->gpgpu_num_tensor_core_units > 0) && + m_tensor_core_out->has_free( + m_shader->m_config->sub_core_model, m_id); + if (tensor_core_pipe_avail) { m_shader->issue_warp(*m_tensor_core_out, pI, active_mask, warp_id, m_id); @@ -1407,7 +1543,7 @@ void scheduler_unit::cycle() { m_last_supervised_issued = supervised_iter; } } - + m_num_issued_last_cycle = issued; if (issued == 1) m_stats->single_issue_nums[m_id]++; else if (issued > 1) @@ -1456,6 +1592,10 @@ void lrr_scheduler::order_warps() { order_lrr(m_next_cycle_prioritized_warps, m_supervised_warps, m_last_supervised_issued, m_supervised_warps.size()); } +void rrr_scheduler::order_warps() { + order_rrr(m_next_cycle_prioritized_warps, m_supervised_warps, + m_last_supervised_issued, m_supervised_warps.size()); +} void gto_scheduler::order_warps() { order_by_priority(m_next_cycle_prioritized_warps, m_supervised_warps, @@ -1569,7 +1709,10 @@ void swl_scheduler::order_warps() { } } -void shader_core_ctx::read_operands() {} +void
shader_core_ctx::read_operands() { + for (unsigned int i = 0; i < m_config->reg_file_port_throughput; ++i) + m_operand_collector.step(); +} address_type coalesced_segment(address_type addr, unsigned segment_size_lg2bytes) { @@ -1669,8 +1812,15 @@ void shader_core_ctx::execute() { m_fu[n]->active_lanes_in_pipeline(); unsigned issue_port = m_issue_port[n]; register_set &issue_inst = m_pipeline_reg[issue_port]; - warp_inst_t **ready_reg = issue_inst.get_ready(); - if (issue_inst.has_ready() && m_fu[n]->can_issue(**ready_reg)) { + unsigned reg_id; + bool partition_issue = + m_config->sub_core_model && m_fu[n]->is_issue_partitioned(); + if (partition_issue) { + reg_id = m_fu[n]->get_issue_reg_id(); + } + warp_inst_t **ready_reg = issue_inst.get_ready(partition_issue, reg_id); + if (issue_inst.has_ready(partition_issue, reg_id) && + m_fu[n]->can_issue(**ready_reg)) { bool schedule_wb_now = !m_fu[n]->stallable(); int resbus = -1; if (schedule_wb_now && @@ -1711,12 +1861,55 @@ void ldst_unit::get_L1T_sub_stats(struct cache_sub_stats &css) const { if (m_L1T) m_L1T->get_sub_stats(css); } +// Add this function to unset depbar +void shader_core_ctx::unset_depbar(const warp_inst_t &inst) { + bool done_flag = true; + unsigned int end_group = m_warp[inst.warp_id()]->m_depbar_start_id == 0 + ? m_warp[inst.warp_id()]->m_ldgdepbar_buf.size() + : (m_warp[inst.warp_id()]->m_depbar_start_id - + m_warp[inst.warp_id()]->m_depbar_group + 1); + + if (inst.m_is_ldgsts) { + for (int i = 0; i < m_warp[inst.warp_id()]->m_ldgdepbar_buf.size(); i++) { + for (int j = 0; j < m_warp[inst.warp_id()]->m_ldgdepbar_buf[i].size(); + j++) { + if (m_warp[inst.warp_id()]->m_ldgdepbar_buf[i][j].pc == inst.pc) { + // Handle the case that same pc results in multiple LDGSTS + // instructions + if (m_warp[inst.warp_id()]->m_ldgdepbar_buf[i][j].get_addr(0) == + inst.get_addr(0)) { + m_warp[inst.warp_id()]->m_ldgdepbar_buf[i][j].pc = -1; + goto DoneWB; + } + } + } + } + + DoneWB: + for (int i = 0; i < end_group; i++) { + for (int j = 0; j < m_warp[inst.warp_id()]->m_ldgdepbar_buf[i].size(); + j++) { + if (m_warp[inst.warp_id()]->m_ldgdepbar_buf[i][j].pc != -1) { + done_flag = false; + goto UpdateDEPBAR; + } + } + } + + UpdateDEPBAR: + if (done_flag) { + if (m_warp[inst.warp_id()]->m_waiting_ldgsts) { + m_warp[inst.warp_id()]->m_waiting_ldgsts = false; + } + } + } +} + void shader_core_ctx::warp_inst_complete(const warp_inst_t &inst) { #if 0 printf("[warp_inst_complete] uid=%u core=%u warp=%u pc=%#x @ time=%llu \n", inst.get_uid(), m_sid, inst.warp_id(), inst.pc, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle); #endif - if (inst.op_pipe == SP__OP) m_stats->m_num_sp_committed[m_sid]++; else if (inst.op_pipe == SFU__OP) @@ -1795,6 +1988,7 @@ bool ldst_unit::shared_cycle(warp_inst_t &inst, mem_stage_stall_type &rc_fail, if (stall) { fail_type = S_MEM; rc_fail = BK_CONF; + m_stats->gpgpu_n_shmem_bkconflict++; } else rc_fail = NO_RC_FAIL; return !stall; @@ -1821,6 +2015,14 @@ mem_stage_stall_type ldst_unit::process_cache_access( if (inst.is_load()) { for (unsigned r = 0; r < MAX_OUTPUT_VALUES; r++) if (inst.out[r] > 0) m_pending_writes[inst.warp_id()][inst.out[r]]--; + + // release LDGSTS + if (inst.m_is_ldgsts) { + m_pending_ldgsts[inst.warp_id()][inst.pc][inst.get_addr(0)]--; + if (m_pending_ldgsts[inst.warp_id()][inst.pc][inst.get_addr(0)] == 0) { + m_core->unset_depbar(inst); + } + } } if (!write_sent) delete mf; } else if (status == RESERVATION_FAIL) { @@ -1863,7 +2065,7 @@ mem_stage_stall_type 
ldst_unit::process_memory_access_queue_l1cache( if (inst.accessq_empty()) return result; if (m_config->m_L1D_config.l1_latency > 0) { - for (int j = 0; j < m_config->m_L1D_config.l1_banks; + for (unsigned int j = 0; j < m_config->m_L1D_config.l1_banks; j++) { // We can handle at max l1_banks reqs per cycle if (inst.accessq_empty()) return result; @@ -1892,6 +2094,7 @@ mem_stage_stall_type ldst_unit::process_memory_access_queue_l1cache( inst.accessq_pop_back(); } else { result = BK_CONF; + m_stats->gpgpu_n_l1cache_bkconflict++; delete mf; break; // do not try again, just break from the loop and try the next // cycle @@ -1916,7 +2119,7 @@ mem_stage_stall_type ldst_unit::process_memory_access_queue_l1cache( } void ldst_unit::L1_latency_queue_cycle() { - for (int j = 0; j < m_config->m_L1D_config.l1_banks; j++) { + for (unsigned int j = 0; j < m_config->m_L1D_config.l1_banks; j++) { if ((l1_latency_queue[j][0]) != NULL) { mem_fetch *mf_next = l1_latency_queue[j][0]; std::list<cache_event> events; @@ -1948,6 +2151,18 @@ void ldst_unit::L1_latency_queue_cycle() { m_core->warp_inst_complete(mf_next->get_inst()); } } + + // release LDGSTS + if (mf_next->get_inst().m_is_ldgsts) { + m_pending_ldgsts[mf_next->get_inst().warp_id()] + [mf_next->get_inst().pc] + [mf_next->get_inst().get_addr(0)]--; + if (m_pending_ldgsts[mf_next->get_inst().warp_id()] + [mf_next->get_inst().pc] + [mf_next->get_inst().get_addr(0)] == 0) { + m_core->unset_depbar(mf_next->get_inst()); + } + } } // For write hit in WB policy @@ -1970,6 +2185,21 @@ void ldst_unit::L1_latency_queue_cycle() { } else { assert(status == MISS || status == HIT_RESERVED); l1_latency_queue[j][0] = NULL; + if (m_config->m_L1D_config.get_write_policy() != WRITE_THROUGH && + mf_next->get_inst().is_store() && + (m_config->m_L1D_config.get_write_allocate_policy() == + FETCH_ON_WRITE || + m_config->m_L1D_config.get_write_allocate_policy() == + LAZY_FETCH_ON_READ) && + !was_writeallocate_sent(events)) { + unsigned dec_ack = + (m_config->m_L1D_config.get_mshr_type() == SECTOR_ASSOC) + ? (mf_next->get_data_size() / SECTOR_SIZE) + : 1; + mf_next->set_reply(); + for (unsigned i = 0; i < dec_ack; ++i) m_core->store_ack(mf_next); + if (!write_sent && !read_sent) delete mf_next; + } } } @@ -1992,10 +2222,12 @@ bool ldst_unit::constant_cycle(warp_inst_t &inst, mem_stage_stall_type &rc_fail, mem_stage_stall_type fail; if (m_config->perfect_inst_const_cache) { fail = NO_RC_FAIL; + unsigned access_count = inst.accessq_count(); while (inst.accessq_count() > 0) inst.accessq_pop_back(); if (inst.is_load()) { for (unsigned r = 0; r < MAX_OUTPUT_VALUES; r++) - if (inst.out[r] > 0) m_pending_writes[inst.warp_id()][inst.out[r]]--; + if (inst.out[r] > 0) + m_pending_writes[inst.warp_id()][inst.out[r]] -= access_count; } } else { fail = process_memory_access_queue(m_L1C, inst); @@ -2052,7 +2284,15 @@ bool ldst_unit::memory_cycle(warp_inst_t &inst, inst.is_store() ?
WRITE_PACKET_SIZE : READ_PACKET_SIZE; unsigned size = access.get_size() + control_size; // printf("Interconnect:Addr: %x, size=%d\n",access.get_addr(),size); - if (m_icnt->full(size, inst.is_store() || inst.isatomic())) { + if (m_memory_config->SST_mode && + (static_cast<sst_memory_interface *>(m_icnt)->full( + size, inst.is_store() || inst.isatomic(), access.get_type()))) { + // SST needs the mf type here + // Cast it to sst_memory_interface pointer first as this full() method + // is not a virtual method in the parent class + stall_cond = ICNT_RC_FAIL; + } else if (!m_memory_config->SST_mode && + (m_icnt->full(size, inst.is_store() || inst.isatomic()))) { stall_cond = ICNT_RC_FAIL; } else { mem_fetch *mf = @@ -2112,22 +2352,32 @@ simd_function_unit::simd_function_unit(const shader_core_config *config) { m_dispatch_reg = new warp_inst_t(config); } +void simd_function_unit::issue(register_set &source_reg) { + bool partition_issue = + m_config->sub_core_model && this->is_issue_partitioned(); + source_reg.move_out_to(partition_issue, this->get_issue_reg_id(), + m_dispatch_reg); + occupied.set(m_dispatch_reg->latency); +} + sfu::sfu(register_set *result_port, const shader_core_config *config, - shader_core_ctx *core) - : pipelined_simd_unit(result_port, config, config->max_sfu_latency, core) { + shader_core_ctx *core, unsigned issue_reg_id) + : pipelined_simd_unit(result_port, config, config->max_sfu_latency, core, + issue_reg_id) { m_name = "SFU"; } tensor_core::tensor_core(register_set *result_port, const shader_core_config *config, - shader_core_ctx *core) + shader_core_ctx *core, unsigned issue_reg_id) : pipelined_simd_unit(result_port, config, config->max_tensor_core_latency, - core) { + core, issue_reg_id) { m_name = "TENSOR_CORE"; } void sfu::issue(register_set &source_reg) { - warp_inst_t **ready_reg = source_reg.get_ready(); + warp_inst_t **ready_reg = + source_reg.get_ready(m_config->sub_core_model, m_issue_reg_id); // m_core->incexecstat((*ready_reg)); (*ready_reg)->op_pipe = SFU__OP; @@ -2136,7 +2386,8 @@ } void tensor_core::issue(register_set &source_reg) { - warp_inst_t **ready_reg = source_reg.get_ready(); + warp_inst_t **ready_reg = + source_reg.get_ready(m_config->sub_core_model, m_issue_reg_id); // m_core->incexecstat((*ready_reg)); (*ready_reg)->op_pipe = TENSOR_CORE__OP; @@ -2172,7 +2423,7 @@ void sp_unit::active_lanes_in_pipeline() { void dp_unit::active_lanes_in_pipeline() { unsigned active_count = pipelined_simd_unit::get_active_lanes_in_pipeline(); assert(active_count <= m_core->get_config()->warp_size); - m_core->incspactivelanes_stat(active_count); + // m_core->incspactivelanes_stat(active_count); m_core->incfuactivelanes_stat(active_count); m_core->incfumemactivelanes_stat(active_count); } @@ -2208,34 +2459,39 @@ void tensor_core::active_lanes_in_pipeline() { } sp_unit::sp_unit(register_set *result_port, const shader_core_config *config, - shader_core_ctx *core) - : pipelined_simd_unit(result_port, config, config->max_sp_latency, core) { + shader_core_ctx *core, unsigned issue_reg_id) + : pipelined_simd_unit(result_port, config, config->max_sp_latency, core, + issue_reg_id) { m_name = "SP "; } specialized_unit::specialized_unit(register_set *result_port, const shader_core_config *config, - shader_core_ctx *core, unsigned supported_op, - char *unit_name, unsigned latency) - : pipelined_simd_unit(result_port, config, latency, core) { + shader_core_ctx *core, int supported_op, + char *unit_name, unsigned latency, + unsigned issue_reg_id) + :
pipelined_simd_unit(result_port, config, latency, core, issue_reg_id) { m_name = unit_name; m_supported_op = supported_op; } dp_unit::dp_unit(register_set *result_port, const shader_core_config *config, - shader_core_ctx *core) - : pipelined_simd_unit(result_port, config, config->max_dp_latency, core) { + shader_core_ctx *core, unsigned issue_reg_id) + : pipelined_simd_unit(result_port, config, config->max_dp_latency, core, + issue_reg_id) { m_name = "DP "; } int_unit::int_unit(register_set *result_port, const shader_core_config *config, - shader_core_ctx *core) - : pipelined_simd_unit(result_port, config, config->max_int_latency, core) { + shader_core_ctx *core, unsigned issue_reg_id) + : pipelined_simd_unit(result_port, config, config->max_int_latency, core, + issue_reg_id) { m_name = "INT "; } void sp_unit ::issue(register_set &source_reg) { - warp_inst_t **ready_reg = source_reg.get_ready(); + warp_inst_t **ready_reg = + source_reg.get_ready(m_config->sub_core_model, m_issue_reg_id); // m_core->incexecstat((*ready_reg)); (*ready_reg)->op_pipe = SP__OP; m_core->incsp_stat(m_core->get_config()->warp_size, (*ready_reg)->latency); @@ -2243,7 +2499,8 @@ void sp_unit ::issue(register_set &source_reg) { } void dp_unit ::issue(register_set &source_reg) { - warp_inst_t **ready_reg = source_reg.get_ready(); + warp_inst_t **ready_reg = + source_reg.get_ready(m_config->sub_core_model, m_issue_reg_id); // m_core->incexecstat((*ready_reg)); (*ready_reg)->op_pipe = DP__OP; m_core->incsp_stat(m_core->get_config()->warp_size, (*ready_reg)->latency); @@ -2251,7 +2508,8 @@ void dp_unit ::issue(register_set &source_reg) { } void specialized_unit ::issue(register_set &source_reg) { - warp_inst_t **ready_reg = source_reg.get_ready(); + warp_inst_t **ready_reg = + source_reg.get_ready(m_config->sub_core_model, m_issue_reg_id); // m_core->incexecstat((*ready_reg)); (*ready_reg)->op_pipe = SPECIALIZED__OP; m_core->incsp_stat(m_core->get_config()->warp_size, (*ready_reg)->latency); @@ -2259,7 +2517,8 @@ void specialized_unit ::issue(register_set &source_reg) { } void int_unit ::issue(register_set &source_reg) { - warp_inst_t **ready_reg = source_reg.get_ready(); + warp_inst_t **ready_reg = + source_reg.get_ready(m_config->sub_core_model, m_issue_reg_id); // m_core->incexecstat((*ready_reg)); (*ready_reg)->op_pipe = INTP__OP; m_core->incsp_stat(m_core->get_config()->warp_size, (*ready_reg)->latency); @@ -2269,7 +2528,8 @@ void int_unit ::issue(register_set &source_reg) { pipelined_simd_unit::pipelined_simd_unit(register_set *result_port, const shader_core_config *config, unsigned max_latency, - shader_core_ctx *core) + shader_core_ctx *core, + unsigned issue_reg_id) : simd_function_unit(config) { m_result_port = result_port; m_pipeline_depth = max_latency; @@ -2277,6 +2537,7 @@ pipelined_simd_unit::pipelined_simd_unit(register_set *result_port, for (unsigned i = 0; i < m_pipeline_depth; i++) m_pipeline_reg[i] = new warp_inst_t(config); m_core = core; + m_issue_reg_id = issue_reg_id; active_insts_in_pipeline = 0; } @@ -2294,8 +2555,10 @@ void pipelined_simd_unit::cycle() { if (!m_dispatch_reg->dispatch_delay()) { int start_stage = m_dispatch_reg->latency - m_dispatch_reg->initiation_interval; - move_warp(m_pipeline_reg[start_stage], m_dispatch_reg); - active_insts_in_pipeline++; + if (m_pipeline_reg[start_stage]->empty()) { + move_warp(m_pipeline_reg[start_stage], m_dispatch_reg); + active_insts_in_pipeline++; + } } } occupied >>= 1; @@ -2303,7 +2566,10 @@ void pipelined_simd_unit::cycle() { void 
pipelined_simd_unit::issue(register_set &source_reg) { // move_warp(m_dispatch_reg,source_reg); - warp_inst_t **ready_reg = source_reg.get_ready(); + bool partition_issue = + m_config->sub_core_model && this->is_issue_partitioned(); + warp_inst_t **ready_reg = + source_reg.get_ready(partition_issue, m_issue_reg_id); m_core->incexecstat((*ready_reg)); // source_reg.move_out_to(m_dispatch_reg); simd_function_unit::issue(source_reg); @@ -2343,7 +2609,7 @@ void ldst_unit::init(mem_fetch_interface *icnt, IN_SHADER_L1T_ROB); m_L1C = new read_only_cache(L1C_name, m_config->m_L1C_config, m_sid, get_shader_constant_cache_id(), icnt, - IN_L1C_MISS_QUEUE); + IN_L1C_MISS_QUEUE, OTHER_GPU_CACHE, m_gpu); m_L1D = NULL; m_mem_rc = NO_RC_FAIL; m_num_writeback_clients = @@ -2359,9 +2625,10 @@ ldst_unit::ldst_unit(mem_fetch_interface *icnt, shader_core_ctx *core, opndcoll_rfu_t *operand_collector, Scoreboard *scoreboard, const shader_core_config *config, const memory_config *mem_config, shader_core_stats *stats, - unsigned sid, unsigned tpc) - : pipelined_simd_unit(NULL, config, config->smem_latency, core), - m_next_wb(config) { + unsigned sid, unsigned tpc, gpgpu_sim *gpu) + : pipelined_simd_unit(NULL, config, config->smem_latency, core, 0), + m_next_wb(config), + m_gpu(gpu) { assert(config->smem_latency > 1); init(icnt, mf_allocator, core, operand_collector, scoreboard, config, mem_config, stats, sid, tpc); @@ -2370,7 +2637,7 @@ ldst_unit::ldst_unit(mem_fetch_interface *icnt, snprintf(L1D_name, STRSIZE, "L1D_%03d", m_sid); m_L1D = new l1_cache(L1D_name, m_config->m_L1D_config, m_sid, get_shader_normal_cache_id(), m_icnt, m_mf_allocator, - IN_L1D_MISS_QUEUE, core->get_gpu()); + IN_L1D_MISS_QUEUE, core->get_gpu(), L1_GPU_CACHE); l1_latency_queue.resize(m_config->m_L1D_config.l1_banks); assert(m_config->m_L1D_config.l1_latency > 0); @@ -2388,7 +2655,7 @@ ldst_unit::ldst_unit(mem_fetch_interface *icnt, Scoreboard *scoreboard, const shader_core_config *config, const memory_config *mem_config, shader_core_stats *stats, unsigned sid, unsigned tpc, l1_cache *new_l1d_cache) - : pipelined_simd_unit(NULL, config, 3, core), + : pipelined_simd_unit(NULL, config, 3, core, 0), m_L1D(new_l1d_cache), m_next_wb(config) { init(icnt, mf_allocator, core, operand_collector, scoreboard, config, @@ -2410,6 +2677,9 @@ void ldst_unit::issue(register_set ®_set) { m_pending_writes[warp_id][reg_id] += n_accesses; } } + if (inst->m_is_ldgsts) { + m_pending_ldgsts[warp_id][inst->pc][inst->get_addr(0)] += n_accesses; + } } inst->op_pipe = MEM__OP; @@ -2441,11 +2711,24 @@ void ldst_unit::writeback() { m_next_wb.out[r]); insn_completed = true; } + } else if (m_next_wb.m_is_ldgsts) { // for LDGSTS instructions where no + // output register is used + m_pending_ldgsts[m_next_wb.warp_id()][m_next_wb.pc] + [m_next_wb.get_addr(0)]--; + if (m_pending_ldgsts[m_next_wb.warp_id()][m_next_wb.pc] + [m_next_wb.get_addr(0)] == 0) { + insn_completed = true; + } + break; } } if (insn_completed) { m_core->warp_inst_complete(m_next_wb); + if (m_next_wb.m_is_ldgsts) { + m_core->unset_depbar(m_next_wb); + } } + m_next_wb.clear(); m_last_inst_gpu_sim_cycle = m_core->get_gpu()->gpu_sim_cycle; m_last_inst_gpu_tot_sim_cycle = m_core->get_gpu()->gpu_tot_sim_cycle; @@ -2550,8 +2833,7 @@ inst->space.get_type() != shared_space) { unsigned warp_id = inst->warp_id(); */ void ldst_unit::cycle() { writeback(); - for (int i = 0; i < m_config->reg_file_port_throughput; ++i) - m_operand_collector->step(); + for (unsigned stage = 0; (stage + 1) < m_pipeline_depth; stage++) 
if (m_pipeline_reg[stage]->empty() && !m_pipeline_reg[stage + 1]->empty()) move_warp(m_pipeline_reg[stage], m_pipeline_reg[stage + 1]); @@ -2575,7 +2857,10 @@ void ldst_unit::cycle() { } } else { if (mf->get_type() == WRITE_ACK || - (m_config->gpgpu_perfect_mem && mf->get_is_write())) { + ((m_config->gpgpu_perfect_mem || m_memory_config->SST_mode) && + mf->get_is_write())) { + // SST memory is handled by SST mem hierarchy + // Perfect mem m_core->store_ack(mf); m_response_fifo.pop_front(); delete mf; @@ -2668,6 +2953,15 @@ void ldst_unit::cycle() { if (!pending_requests) { m_core->warp_inst_complete(*m_dispatch_reg); m_scoreboard->releaseRegisters(m_dispatch_reg); + + // release LDGSTS + if (m_dispatch_reg->m_is_ldgsts) { + // m_pending_ldgsts[m_dispatch_reg->warp_id()][m_dispatch_reg->pc][m_dispatch_reg->get_addr(0)]--; + if (m_pending_ldgsts[m_dispatch_reg->warp_id()][m_dispatch_reg->pc] + [m_dispatch_reg->get_addr(0)] == 0) { + m_core->unset_depbar(*m_dispatch_reg); + } + } } m_core->dec_inst_in_pipeline(warp_id); m_dispatch_reg->clear(); @@ -2956,7 +3250,7 @@ void warp_inst_t::print(FILE *fout) const { fprintf(fout, "bubble\n"); return; } else - fprintf(fout, "0x%04x ", pc); + fprintf(fout, "0x%04llx ", pc); fprintf(fout, "w%02d[", m_warp_id); for (unsigned j = 0; j < m_config->warp_size; j++) fprintf(fout, "%c", (active(j) ? '1' : '0')); @@ -2965,51 +3259,68 @@ fprintf(fout, "\n"); } void shader_core_ctx::incexecstat(warp_inst_t *&inst) { - if (inst->mem_op == TEX) inctex_stat(inst->active_count(), 1); - // Latency numbers for next operations are used to scale the power values // for special operations, according to observations from microbenchmarking // TODO: put these numbers in the xml configuration - - switch (inst->sp_op) { - case INT__OP: - incialu_stat(inst->active_count(), 32); - break; - case INT_MUL_OP: - incimul_stat(inst->active_count(), 7.2); - break; - case INT_MUL24_OP: - incimul24_stat(inst->active_count(), 4.2); - break; - case INT_MUL32_OP: - incimul32_stat(inst->active_count(), 4); - break; - case INT_DIV_OP: - incidiv_stat(inst->active_count(), 40); - break; - case FP__OP: - incfpalu_stat(inst->active_count(), 1); - break; - case FP_MUL_OP: - incfpmul_stat(inst->active_count(), 1.8); - break; - case FP_DIV_OP: - incfpdiv_stat(inst->active_count(), 48); - break; - case FP_SQRT_OP: - inctrans_stat(inst->active_count(), 25); - break; - case FP_LG_OP: - inctrans_stat(inst->active_count(), 35); - break; - case FP_SIN_OP: - inctrans_stat(inst->active_count(), 12); - break; - case FP_EXP_OP: - inctrans_stat(inst->active_count(), 35); - break; - default: - break; + if (get_gpu()->get_config().g_power_simulation_enabled) { + switch (inst->sp_op) { + case INT__OP: + incialu_stat(inst->active_count(), scaling_coeffs->int_coeff); + break; + case INT_MUL_OP: + incimul_stat(inst->active_count(), scaling_coeffs->int_mul_coeff); + break; + case INT_MUL24_OP: + incimul24_stat(inst->active_count(), scaling_coeffs->int_mul24_coeff); + break; + case INT_MUL32_OP: + incimul32_stat(inst->active_count(), scaling_coeffs->int_mul32_coeff); + break; + case INT_DIV_OP: + incidiv_stat(inst->active_count(), scaling_coeffs->int_div_coeff); + break; + case FP__OP: + incfpalu_stat(inst->active_count(), scaling_coeffs->fp_coeff); + break; + case FP_MUL_OP: + incfpmul_stat(inst->active_count(), scaling_coeffs->fp_mul_coeff); + break; + case FP_DIV_OP: + incfpdiv_stat(inst->active_count(), scaling_coeffs->fp_div_coeff); + break; + case DP___OP: +
incdpalu_stat(inst->active_count(), scaling_coeffs->dp_coeff); + break; + case DP_MUL_OP: + incdpmul_stat(inst->active_count(), scaling_coeffs->dp_mul_coeff); + break; + case DP_DIV_OP: + incdpdiv_stat(inst->active_count(), scaling_coeffs->dp_div_coeff); + break; + case FP_SQRT_OP: + incsqrt_stat(inst->active_count(), scaling_coeffs->sqrt_coeff); + break; + case FP_LG_OP: + inclog_stat(inst->active_count(), scaling_coeffs->log_coeff); + break; + case FP_SIN_OP: + incsin_stat(inst->active_count(), scaling_coeffs->sin_coeff); + break; + case FP_EXP_OP: + incexp_stat(inst->active_count(), scaling_coeffs->exp_coeff); + break; + case TENSOR__OP: + inctensor_stat(inst->active_count(), scaling_coeffs->tensor_coeff); + break; + case TEX__OP: + inctex_stat(inst->active_count(), scaling_coeffs->tex_coeff); + break; + default: + break; + } + if (inst->const_cache_operand) // warp has const address space load as one + // operand + inc_const_accesses(1); } } void shader_core_ctx::print_stage(unsigned int stage, FILE *fout) const { @@ -3125,7 +3436,7 @@ void shader_core_ctx::display_pipeline(FILE *fout, int print_mem, if (!m_inst_fetch_buffer.m_valid) fprintf(fout, "bubble\n"); else { - fprintf(fout, "w%2u : pc = 0x%x, nbytes = %u\n", + fprintf(fout, "w%2u : pc = 0x%llx, nbytes = %u\n", m_inst_fetch_buffer.m_warp_id, m_inst_fetch_buffer.m_pc, m_inst_fetch_buffer.m_nbytes); } @@ -3264,49 +3575,46 @@ unsigned int shader_core_config::max_cta(const kernel_info_t &k) const { if (adaptive_cache_config && !k.cache_config_set) { // For more info about adaptive cache, see // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x - unsigned total_shmed = kernel_info->smem * result; - assert(total_shmed >= 0 && total_shmed <= gpgpu_shmem_size); - // assert(gpgpu_shmem_size == 98304); //Volta has 96 KB shared - // assert(m_L1D_config.get_nset() == 4); //Volta L1 has four sets - if (total_shmed < gpgpu_shmem_size) { - switch (adaptive_cache_config) { - case FIXED: - break; - case ADAPTIVE_VOLTA: { - // For Volta, we assign the remaining shared memory to L1 cache - // For more info about adaptive cache, see - // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x - // assert(gpgpu_shmem_size == 98304); //Volta has 96 KB shared - - // To Do: make it flexible and not tuned to 9KB share memory - unsigned max_assoc = m_L1D_config.get_max_assoc(); - if (total_shmed == 0) - m_L1D_config.set_assoc(max_assoc); // L1 is 128KB and shd=0 - else if (total_shmed > 0 && total_shmed <= 8192) - m_L1D_config.set_assoc(0.9375 * - max_assoc); // L1 is 120KB and shd=8KB - else if (total_shmed > 8192 && total_shmed <= 16384) - m_L1D_config.set_assoc(0.875 * - max_assoc); // L1 is 112KB and shd=16KB - else if (total_shmed > 16384 && total_shmed <= 32768) - m_L1D_config.set_assoc(0.75 * max_assoc); // L1 is 96KB and - // shd=32KB - else if (total_shmed > 32768 && total_shmed <= 65536) - m_L1D_config.set_assoc(0.5 * max_assoc); // L1 is 64KB and shd=64KB - else if (total_shmed > 65536 && total_shmed <= gpgpu_shmem_size) - m_L1D_config.set_assoc(0.25 * max_assoc); // L1 is 32KB and - // shd=96KB - else - assert(0); - break; - } - default: - assert(0); + unsigned total_shmem = kernel_info->smem * result; + assert(total_shmem >= 0 && total_shmem <= shmem_opt_list.back()); + + // Unified cache config is in KB. 
Converting to B + unsigned total_unified = m_L1D_config.m_unified_cache_size * 1024; + + bool l1d_configured = false; + unsigned max_assoc = m_L1D_config.get_max_assoc(); + + for (std::vector<unsigned>::const_iterator it = shmem_opt_list.begin(); + it < shmem_opt_list.end(); it++) { + if (total_shmem <= *it) { + float l1_ratio = 1 - ((float)*(it) / total_unified); + // make sure the ratio is between 0 and 1 + assert(0 <= l1_ratio && l1_ratio <= 1); + // round to nearest instead of round down + m_L1D_config.set_assoc(max_assoc * l1_ratio + 0.5f); + l1d_configured = true; + break; } + } - printf("GPGPU-Sim: Reconfigure L1 cache to %uKB\n", - m_L1D_config.get_total_size_inKB()); + assert(l1d_configured && "no shared memory option found"); + + if (m_L1D_config.is_streaming()) { + // for streaming cache, if the whole memory is allocated + // to the L1 cache, then make the allocation to be on_MISS + // otherwise, make it ON_FILL to eliminate line allocation fails + // i.e. MSHR throughput is the same, independent of the L1 cache + // size/associativity + if (total_shmem == 0) { + m_L1D_config.set_allocation_policy(ON_MISS); + printf("GPGPU-Sim: Reconfigure L1 allocation to ON_MISS\n"); + } else { + m_L1D_config.set_allocation_policy(ON_FILL); + printf("GPGPU-Sim: Reconfigure L1 allocation to ON_FILL\n"); + } } + printf("GPGPU-Sim: Reconfigure L1 cache to %uKB\n", + m_L1D_config.get_total_size_inKB()); k.cache_config_set = true; } @@ -3361,7 +3669,7 @@ void shader_core_ctx::cycle() { execute(); read_operands(); issue(); - for (int i = 0; i < m_config->inst_fetch_throughput; ++i) { + for (unsigned int i = 0; i < m_config->inst_fetch_throughput; ++i) { decode(); fetch(); } @@ -3726,7 +4034,8 @@ void shader_core_ctx::accept_ldst_unit_response(mem_fetch *mf) { void shader_core_ctx::store_ack(class mem_fetch *mf) { assert(mf->get_type() == WRITE_ACK || - (m_config->gpgpu_perfect_mem && mf->get_is_write())); + ((m_config->gpgpu_perfect_mem || m_memory_config->SST_mode) && + mf->get_is_write())); unsigned warp_id = mf->get_wid(); m_warp[warp_id]->dec_store_req(); } @@ -3761,6 +4070,10 @@ void shader_core_ctx::get_icnt_power_stats(long &n_simt_to_mem, n_mem_to_simt += m_stats->n_mem_to_simt[m_sid]; } +kernel_info_t *shd_warp_t::get_kernel_info() const { + return m_shader->get_kernel_info(); +} + bool shd_warp_t::functional_done() const { return get_n_completed() == m_warp_size; } @@ -3786,13 +4099,16 @@ bool shd_warp_t::waiting() { // the functional execution of the atomic when it hits DRAM can cause // the wrong register to be read. return true; + } else if (m_waiting_ldgsts) { // Waiting for LDGSTS to finish + return true; } return false; } void shd_warp_t::print(FILE *fout) const { if (!done_exit()) { - fprintf(fout, "w%02u npc: 0x%04x, done:%c%c%c%c:%2u i:%u s:%u a:%u (done: ", + fprintf(fout, + "w%02u npc: 0x%04llx, done:%c%c%c%c:%2u i:%u s:%u a:%u (done: ", m_warp_id, m_next_pc, (functional_done() ? 'f' : ' '), (stores_done() ? 's' : ' '), (inst_in_pipeline() ? ' ' : 'i'), (done_exit() ?
'e' : ' '), n_completed, m_inst_in_pipeline, @@ -3861,30 +4177,38 @@ void opndcoll_rfu_t::init(unsigned num_banks, shader_core_ctx *shader) { // for( unsigned n=0; n<m_num_ports;n++ ) //    m_dispatch_units[m_output[n]].init( m_num_collector_units[n] ); m_warp_size = shader->get_config()->warp_size; - m_bank_warp_shift = (unsigned)(int)(log(m_warp_size + 0.5) / log(2.0)); - assert((m_bank_warp_shift == 5) || (m_warp_size != 32)); sub_core_model = shader->get_config()->sub_core_model; - m_num_warp_sceds = shader->get_config()->gpgpu_num_sched_per_core; - if (sub_core_model) + m_num_warp_scheds = shader->get_config()->gpgpu_num_sched_per_core; + unsigned reg_id = 0; + if (sub_core_model) { assert(num_banks % shader->get_config()->gpgpu_num_sched_per_core == 0); + assert(m_num_warp_scheds <= m_cu.size() && + m_cu.size() % m_num_warp_scheds == 0); + } m_num_banks_per_sched = num_banks / shader->get_config()->gpgpu_num_sched_per_core; for (unsigned j = 0; j < m_cu.size(); j++) { - m_cu[j]->init(j, num_banks, m_bank_warp_shift, shader->get_config(), this, - sub_core_model, m_num_banks_per_sched); + if (sub_core_model) { + unsigned cusPerSched = m_cu.size() / m_num_warp_scheds; + reg_id = j / cusPerSched; + } + m_cu[j]->init(j, num_banks, shader->get_config(), this, sub_core_model, + reg_id, m_num_banks_per_sched); + } + for (unsigned j = 0; j < m_dispatch_units.size(); j++) { + m_dispatch_units[j].init(sub_core_model, m_num_warp_scheds); } m_initialized = true; } -int register_bank(int regnum, int wid, unsigned num_banks, - unsigned bank_warp_shift, bool sub_core_model, - unsigned banks_per_sched, unsigned sched_id) { +unsigned register_bank(int regnum, int wid, unsigned num_banks, + bool sub_core_model, unsigned banks_per_sched, + unsigned sched_id) { int bank = regnum; - if (bank_warp_shift) bank += wid; + bank += wid; if (sub_core_model) { unsigned bank_num = (bank % banks_per_sched) + (sched_id * banks_per_sched); assert(bank_num < num_banks); @@ -3895,19 +4219,19 @@ int register_bank(int regnum, int wid, unsigned num_banks, bool opndcoll_rfu_t::writeback(warp_inst_t &inst) { assert(!inst.empty()); + std::list<unsigned> regs = m_shader->get_regs_written(inst); for (unsigned op = 0; op < MAX_REG_OPERANDS; op++) { int reg_num = inst.arch_reg.dst[op]; // this math needs to match that used // in function_info::ptx_decode_inst if (reg_num >= 0) { // valid register - unsigned bank = register_bank(reg_num, inst.warp_id(), m_num_banks, - m_bank_warp_shift, sub_core_model, - m_num_banks_per_sched, inst.get_schd_id()); + unsigned bank = + register_bank(reg_num, inst.warp_id(), m_num_banks, sub_core_model, + m_num_banks_per_sched, inst.get_schd_id()); if (m_arbiter.bank_idle(bank)) { m_arbiter.allocate_bank_for_write( - bank, - op_t(&inst, reg_num, m_num_banks, m_bank_warp_shift, sub_core_model, - m_num_banks_per_sched, inst.get_schd_id())); + bank, op_t(&inst, reg_num, m_num_banks, sub_core_model, + m_num_banks_per_sched, inst.get_schd_id())); inst.arch_reg.dst[op] = -1; } else { return false; @@ -3974,7 +4298,22 @@ void opndcoll_rfu_t::allocate_cu(unsigned port_num) { for (unsigned j = 0; j < inp.m_cu_sets.size(); j++) { std::vector<collector_unit_t> &cu_set = m_cus[inp.m_cu_sets[j]]; bool allocated = false; - for (unsigned k = 0; k < cu_set.size(); k++) { + unsigned cuLowerBound = 0; + unsigned cuUpperBound = cu_set.size(); + unsigned schd_id; + if (sub_core_model) { + // Sub core model only allocates on the subset of CUs assigned to the + // scheduler that issued + unsigned reg_id = (*inp.m_in[i]).get_ready_reg_id(); + schd_id = (*inp.m_in[i]).get_schd_id(reg_id); + assert(cu_set.size() % m_num_warp_scheds == 0 && + cu_set.size() >=
m_num_warp_scheds); + unsigned cusPerSched = cu_set.size() / m_num_warp_scheds; + cuLowerBound = schd_id * cusPerSched; + cuUpperBound = cuLowerBound + cusPerSched; + assert(0 <= cuLowerBound && cuUpperBound <= cu_set.size()); + } + for (unsigned k = cuLowerBound; k < cuUpperBound; k++) { if (cu_set[k].is_free()) { collector_unit_t *cu = &cu_set[k]; allocated = cu->allocate(inp.m_in[i], inp.m_out[i]); @@ -3984,8 +4323,9 @@ void opndcoll_rfu_t::allocate_cu(unsigned port_num) { } if (allocated) break; // cu has been allocated, no need to search more. } - break; // can only service a single input, if it failed it will fail for - // others. + // break; // can only service a single input, if it failed it will fail + // for + // others. } } } @@ -3999,9 +4339,8 @@ void opndcoll_rfu_t::allocate_reads() { const op_t &rr = *r; unsigned reg = rr.get_reg(); unsigned wid = rr.get_wid(); - unsigned bank = - register_bank(reg, wid, m_num_banks, m_bank_warp_shift, sub_core_model, - m_num_banks_per_sched, rr.get_sid()); + unsigned bank = register_bank(reg, wid, m_num_banks, sub_core_model, + m_num_banks_per_sched, rr.get_sid()); m_arbiter.allocate_for_read(bank, rr); read_ops[bank] = rr; } @@ -4032,7 +4371,8 @@ void opndcoll_rfu_t::allocate_reads() { } bool opndcoll_rfu_t::collector_unit_t::ready() const { - return (!m_free) && m_not_ready.none() && (*m_output_register).has_free(); + return (!m_free) && m_not_ready.none() && + (*m_output_register).has_free(m_sub_core_model, m_reg_id); } void opndcoll_rfu_t::collector_unit_t::dump( @@ -4051,18 +4391,18 @@ void opndcoll_rfu_t::collector_unit_t::dump( } void opndcoll_rfu_t::collector_unit_t::init(unsigned n, unsigned num_banks, - unsigned log2_warp_size, const core_config *config, opndcoll_rfu_t *rfu, bool sub_core_model, + unsigned reg_id, unsigned banks_per_sched) { m_rfu = rfu; m_cuid = n; m_num_banks = num_banks; assert(m_warp == NULL); m_warp = new warp_inst_t(config); - m_bank_warp_shift = log2_warp_size; m_sub_core_model = sub_core_model; + m_reg_id = reg_id; m_num_banks_per_sched = banks_per_sched; } @@ -4075,15 +4415,21 @@ bool opndcoll_rfu_t::collector_unit_t::allocate(register_set *pipeline_reg_set, warp_inst_t **pipeline_reg = pipeline_reg_set->get_ready(); if ((pipeline_reg) and !((*pipeline_reg)->empty())) { m_warp_id = (*pipeline_reg)->warp_id(); + std::vector<int> prev_regs; // remove duplicate regs within same instr for (unsigned op = 0; op < MAX_REG_OPERANDS; op++) { int reg_num = (*pipeline_reg) ->arch_reg.src[op]; // this math needs to match that used in // function_info::ptx_decode_inst - if (reg_num >= 0) { // valid register - m_src_op[op] = op_t(this, op, reg_num, m_num_banks, m_bank_warp_shift, - m_sub_core_model, m_num_banks_per_sched, - (*pipeline_reg)->get_schd_id()); + bool new_reg = true; + for (auto r : prev_regs) { + if (r == reg_num) new_reg = false; + } + if (reg_num >= 0 && new_reg) { // valid register + prev_regs.push_back(reg_num); + m_src_op[op] = + op_t(this, op, reg_num, m_num_banks, m_sub_core_model, + m_num_banks_per_sched, (*pipeline_reg)->get_schd_id()); m_not_ready.set(op); } else m_src_op[op] = op_t(); @@ -4097,8 +4443,7 @@ bool opndcoll_rfu_t::collector_unit_t::allocate(register_set *pipeline_reg_set, void opndcoll_rfu_t::collector_unit_t::dispatch() { assert(m_not_ready.none()); - // move_warp(*m_output_register,m_warp); - m_output_register->move_in(m_warp); + m_output_register->move_in(m_sub_core_model, m_reg_id, m_warp); m_free = true; m_output_register = NULL; for (unsigned i = 0; i < MAX_REG_OPERANDS * 2; i++)
m_src_op[i].reset(); @@ -4243,7 +4588,46 @@ bool simt_core_cluster::icnt_injection_buffer_full(unsigned size, bool write) { return !::icnt_has_buffer(m_cluster_id, request_size); } +bool sst_simt_core_cluster::SST_injection_buffer_full(unsigned size, bool write, + mem_access_type type) { + switch (type) { + case CONST_ACC_R: + case INST_ACC_R: { + return response_queue_full(); + break; + } + default: { + return ::is_SST_buffer_full(m_cluster_id); + break; + } + } +} + void simt_core_cluster::icnt_inject_request_packet(class mem_fetch *mf) { + // Update stats based on mf type + update_icnt_stats(mf); + + // The packet size varies depending on the type of request: + // - For write request and atomic request, the packet contains the data + // - For read request (i.e. not write nor atomic), the packet only has control + // metadata + unsigned int packet_size = mf->size(); + if (!mf->get_is_write() && !mf->isatomic()) { + packet_size = mf->get_ctrl_size(); + } + m_stats->m_outgoing_traffic_stats->record_traffic(mf, packet_size); + unsigned destination = mf->get_sub_partition_id(); + mf->set_status(IN_ICNT_TO_MEM, + m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle); + if (!mf->get_is_write() && !mf->isatomic()) + ::icnt_push(m_cluster_id, m_config->mem2device(destination), (void *)mf, + mf->get_ctrl_size()); + else + ::icnt_push(m_cluster_id, m_config->mem2device(destination), (void *)mf, + mf->size()); +} + +void simt_core_cluster::update_icnt_stats(class mem_fetch *mf) { // stats if (mf->get_is_write()) m_stats->made_write_mfs++; @@ -4288,6 +4672,12 @@ void simt_core_cluster::icnt_inject_request_packet(class mem_fetch *mf) { default: assert(0); } +} + +void sst_simt_core_cluster::icnt_inject_request_packet_to_SST( + class mem_fetch *mf) { + // Update stats + update_icnt_stats(mf); // The packet size varies depending on the type of request: // - For write request and atomic request, the packet contains the data @@ -4298,15 +4688,25 @@ void simt_core_cluster::icnt_inject_request_packet(class mem_fetch *mf) { packet_size = mf->get_ctrl_size(); } m_stats->m_outgoing_traffic_stats->record_traffic(mf, packet_size); - unsigned destination = mf->get_sub_partition_id(); mf->set_status(IN_ICNT_TO_MEM, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle); - if (!mf->get_is_write() && !mf->isatomic()) - ::icnt_push(m_cluster_id, m_config->mem2device(destination), (void *)mf, - mf->get_ctrl_size()); - else - ::icnt_push(m_cluster_id, m_config->mem2device(destination), (void *)mf, - mf->size()); + switch (mf->get_access_type()) { + case CONST_ACC_R: + case INST_ACC_R: { + push_response_fifo(mf); + break; + } + default: { + if (!mf->get_is_write() && !mf->isatomic()) + ::send_read_request_SST(m_cluster_id, mf->get_addr(), + mf->get_data_size(), (void *)mf); + else + ::send_write_request_SST(m_cluster_id, mf->get_addr(), + mf->get_data_size(), (void *)mf); + + break; + } + } } void simt_core_cluster::icnt_cycle() { @@ -4348,6 +4748,49 @@ void simt_core_cluster::icnt_cycle() { } } +void sst_simt_core_cluster::icnt_cycle_SST() { + if (!m_response_fifo.empty()) { + mem_fetch *mf = m_response_fifo.front(); + unsigned cid = m_config->sid_to_cid(mf->get_sid()); + if (mf->get_access_type() == INST_ACC_R) { + // instruction fetch response + if (!m_core[cid]->fetch_unit_response_buffer_full()) { + m_response_fifo.pop_front(); + m_core[cid]->accept_fetch_response(mf); + } + } else { + // data response + if (!m_core[cid]->ldst_unit_response_buffer_full()) { + m_response_fifo.pop_front(); + 
m_memory_stats->memlatstat_read_done(mf); + m_core[cid]->accept_ldst_unit_response(mf); + } + } + } + + // pop from SST buffers + if (m_response_fifo.size() < m_config->n_simt_ejection_buffer_size) { + mem_fetch *mf = (mem_fetch *)(static_cast<sst_gpgpu_sim *>(get_gpu()) + ->SST_pop_mem_reply(m_cluster_id)); + if (!mf) return; + assert(mf->get_tpc() == m_cluster_id); + + // do atomic here + // For now, we execute atomic when the mem reply comes back + // This needs to be validated + if (mf && mf->isatomic()) mf->do_atomic(); + + unsigned int packet_size = + (mf->get_is_write()) ? mf->get_ctrl_size() : mf->size(); + m_stats->m_incoming_traffic_stats->record_traffic(mf, packet_size); + mf->set_status(IN_CLUSTER_TO_SHADER_QUEUE, + m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle); + // m_memory_stats->memlatstat_read_done(mf,m_shader_config->max_warps_per_shader); + m_response_fifo.push_back(mf); + m_stats->n_mem_to_simt[m_cluster_id] += mf->get_num_flits(false); + } +} + void simt_core_cluster::get_pdom_stack_top_info(unsigned sid, unsigned tid, unsigned *pc, unsigned *rpc) const { diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index 6481790bc..ee10af664 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -1,19 +1,22 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Wilson W.L. Fung, Andrew Turner, -// Ali Bakhoda -// The University of British Columbia -// All rights reserved. +// Copyright (c) 2009-2021, Tor M. Aamodt, Wilson W.L. Fung, Andrew Turner, +// Ali Bakhoda, Vijay Kandiah, Nikos Hardavellas, +// Mahmoud Khairy, Junrui Pan, Timothy G. Rogers +// The University of British Columbia, Northwestern University, Purdue +// University All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, +// this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission.
// // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -117,14 +120,30 @@ class shd_warp_t { m_done_exit = true; m_last_fetch = 0; m_next = 0; + m_streamID = (unsigned long long)-1; // Jin: cdp support m_cdp_latency = 0; m_cdp_dummy = false; + + // Ni: Initialize ldgdepbar_id + m_ldgdepbar_id = 0; + m_depbar_start_id = 0; + m_depbar_group = 0; + + // Ni: Set waiting to false + m_waiting_ldgsts = false; + + // Ni: Clear m_ldgdepbar_buf + for (unsigned i = 0; i < m_ldgdepbar_buf.size(); i++) { + m_ldgdepbar_buf[i].clear(); + } + m_ldgdepbar_buf.clear(); } void init(address_type start_pc, unsigned cta_id, unsigned wid, - const std::bitset<MAX_WARP_SIZE> &active, - unsigned dynamic_warp_id) { + const std::bitset<MAX_WARP_SIZE> &active, unsigned dynamic_warp_id, + unsigned long long streamID) { + m_streamID = streamID; m_cta_id = cta_id; m_warp_id = wid; m_dynamic_warp_id = dynamic_warp_id; @@ -138,6 +157,20 @@ class shd_warp_t { // Jin: cdp support m_cdp_latency = 0; m_cdp_dummy = false; + + // Ni: Initialize ldgdepbar_id + m_ldgdepbar_id = 0; + m_depbar_start_id = 0; + m_depbar_group = 0; + + // Ni: Set waiting to false + m_waiting_ldgsts = false; + + // Ni: Clear m_ldgdepbar_buf + for (unsigned i = 0; i < m_ldgdepbar_buf.size(); i++) { + m_ldgdepbar_buf[i].clear(); + } + m_ldgdepbar_buf.clear(); } bool functional_done() const; @@ -169,6 +202,7 @@ class shd_warp_t { void clear_membar() { m_membar = false; } bool get_membar() const { return m_membar; } virtual address_type get_pc() const { return m_next_pc; } + virtual kernel_info_t *get_kernel_info() const; void set_next_pc(address_type pc) { m_next_pc = pc; } void store_info_of_last_inst_at_barrier(const warp_inst_t *pI) { @@ -233,15 +267,20 @@ class shd_warp_t { m_inst_in_pipeline--; } + unsigned long long get_streamID() const { return m_streamID; } unsigned get_cta_id() const { return m_cta_id; } unsigned get_dynamic_warp_id() const { return m_dynamic_warp_id; } unsigned get_warp_id() const { return m_warp_id; } - class shader_core_ctx * get_shader() { return m_shader; } + class shader_core_ctx *get_shader() { + return m_shader; + } + private: static const unsigned IBUFFER_SIZE = 2; class shader_core_ctx *m_shader; + unsigned long long m_streamID; unsigned m_cta_id; unsigned m_warp_id; unsigned m_warp_size; @@ -282,6 +321,16 @@ class shd_warp_t { public: unsigned int m_cdp_latency; bool m_cdp_dummy; + + // Ni: LDGDEPBAR barrier support + public: + unsigned int m_ldgdepbar_id; // LDGDEPBAR barrier ID + std::vector<std::vector<warp_inst_t>> + m_ldgdepbar_buf; // LDGDEPBAR barrier buffer + unsigned int m_depbar_start_id; + unsigned int m_depbar_group; + bool m_waiting_ldgsts; // Ni: Whether the warp is waiting for the LDGSTS + // instrs to finish }; inline unsigned hw_tid_from_wid(unsigned wid, unsigned warp_size, unsigned i) { @@ -294,9 +343,9 @@ inline unsigned wid_from_hw_tid(unsigned tid, unsigned warp_size) { const unsigned WARP_PER_CTA_MAX = 64; typedef std::bitset<WARP_PER_CTA_MAX> warp_set_t; -int register_bank(int regnum, int wid, unsigned num_banks, - unsigned bank_warp_shift, bool sub_core_model, - unsigned banks_per_sched, unsigned sched_id); +unsigned register_bank(int regnum, int wid, unsigned num_banks, + bool sub_core_model, unsigned banks_per_sched, + unsigned sched_id); class shader_core_ctx; class shader_core_config; @@ -318,6 +367,7 @@ enum concrete_scheduler { CONCRETE_SCHEDULER_LRR = 0, CONCRETE_SCHEDULER_GTO, CONCRETE_SCHEDULER_TWO_LEVEL_ACTIVE, + CONCRETE_SCHEDULER_RRR,
CONCRETE_SCHEDULER_WARP_LIMITING, CONCRETE_SCHEDULER_OLDEST_FIRST, NUM_CONCRETE_SCHEDULERS @@ -344,8 +394,8 @@ class scheduler_unit { // this can be copied freely, so can be used in std m_sfu_out(sfu_out), m_int_out(int_out), m_tensor_core_out(tensor_core_out), - m_spec_cores_out(spec_cores_out), m_mem_out(mem_out), + m_spec_cores_out(spec_cores_out), m_id(id) {} virtual ~scheduler_unit() {} virtual void add_supervised_warp_id(int i) { @@ -369,6 +419,12 @@ class scheduler_unit { // this can be copied freely, so can be used in std const typename std::vector<T> &input_list, const typename std::vector<T>::const_iterator &last_issued_from_input, unsigned num_warps_to_add); + template <typename T> + void order_rrr( + typename std::vector<T> &result_list, + const typename std::vector<T> &input_list, + const typename std::vector<T>::const_iterator &last_issued_from_input, + unsigned num_warps_to_add); enum OrderingType { // The item that issued last is prioritized first then the sorted result @@ -427,6 +483,8 @@ class scheduler_unit { // this can be copied freely, so can be used in std register_set *m_tensor_core_out; register_set *m_mem_out; std::vector<register_set *> &m_spec_cores_out; + unsigned m_num_issued_last_cycle; + unsigned m_current_turn_warp; int m_id; }; @@ -450,6 +508,25 @@ class lrr_scheduler : public scheduler_unit { } }; +class rrr_scheduler : public scheduler_unit { + public: + rrr_scheduler(shader_core_stats *stats, shader_core_ctx *shader, + Scoreboard *scoreboard, simt_stack **simt, + std::vector<shd_warp_t *> *warp, register_set *sp_out, + register_set *dp_out, register_set *sfu_out, + register_set *int_out, register_set *tensor_core_out, + std::vector<register_set *> &spec_cores_out, + register_set *mem_out, int id) + : scheduler_unit(stats, shader, scoreboard, simt, warp, sp_out, dp_out, + sfu_out, int_out, tensor_core_out, spec_cores_out, + mem_out, id) {} + virtual ~rrr_scheduler() {} + virtual void order_warps(); + virtual void done_adding_supervised_warps() { + m_last_supervised_issued = m_supervised_warps.end(); + } +}; + class gto_scheduler : public scheduler_unit { public: gto_scheduler(shader_core_stats *stats, shader_core_ctx *shader, @@ -611,28 +688,26 @@ class opndcoll_rfu_t { // operand collector based register file unit public: op_t() { m_valid = false; } op_t(collector_unit_t *cu, unsigned op, unsigned reg, unsigned num_banks, - unsigned bank_warp_shift, bool sub_core_model, - unsigned banks_per_sched, unsigned sched_id) { + bool sub_core_model, unsigned banks_per_sched, unsigned sched_id) { m_valid = true; m_warp = NULL; m_cu = cu; m_operand = op; m_register = reg; m_shced_id = sched_id; - m_bank = register_bank(reg, cu->get_warp_id(), num_banks, bank_warp_shift, - sub_core_model, banks_per_sched, sched_id); + m_bank = register_bank(reg, cu->get_warp_id(), num_banks, sub_core_model, + banks_per_sched, sched_id); } op_t(const warp_inst_t *warp, unsigned reg, unsigned num_banks, - unsigned bank_warp_shift, bool sub_core_model, - unsigned banks_per_sched, unsigned sched_id) { + bool sub_core_model, unsigned banks_per_sched, unsigned sched_id) { m_valid = true; m_warp = warp; m_register = reg; m_cu = NULL; m_operand = -1; m_shced_id = sched_id; - m_bank = register_bank(reg, warp->warp_id(), num_banks, bank_warp_shift, - sub_core_model, banks_per_sched, sched_id); + m_bank = register_bank(reg, warp->warp_id(), num_banks, sub_core_model, + banks_per_sched, sched_id); } // accessors @@ -864,7 +939,6 @@ class opndcoll_rfu_t { // operand collector based register file unit m_not_ready.reset(); m_warp_id = -1; m_num_banks = 0; -
m_bank_warp_shift = 0; } // accessors bool ready() const; @@ -878,11 +952,12 @@ class opndcoll_rfu_t { // operand collector based register file unit } unsigned get_sp_op() const { return m_warp->sp_op; } unsigned get_id() const { return m_cuid; } // returns CU hw id + unsigned get_reg_id() const { return m_reg_id; } // modifiers - void init(unsigned n, unsigned num_banks, unsigned log2_warp_size, - const core_config *config, opndcoll_rfu_t *rfu, - bool m_sub_core_model, unsigned num_banks_per_sched); + void init(unsigned n, unsigned num_banks, const core_config *config, + opndcoll_rfu_t *rfu, bool m_sub_core_model, unsigned reg_id, + unsigned num_banks_per_sched); bool allocate(register_set *pipeline_reg, register_set *output_reg); void collect_operand(unsigned op) { m_not_ready.reset(op); } @@ -901,11 +976,11 @@ class opndcoll_rfu_t { // operand collector based register file unit op_t *m_src_op; std::bitset<MAX_REG_OPERANDS * 2> m_not_ready; unsigned m_num_banks; - unsigned m_bank_warp_shift; opndcoll_rfu_t *m_rfu; unsigned m_num_banks_per_sched; bool m_sub_core_model; + unsigned m_reg_id; // if sub_core_model enabled, limit regs this cu can r/w }; class dispatch_unit_t { @@ -916,10 +991,19 @@ class opndcoll_rfu_t { // operand collector based register file unit m_num_collectors = (*cus).size(); m_next_cu = 0; } + void init(bool sub_core_model, unsigned num_warp_scheds) { + m_sub_core_model = sub_core_model; + m_num_warp_scheds = num_warp_scheds; + } collector_unit_t *find_ready() { + // With sub-core enabled, round robin starts with the next cu assigned to a + // different sub-core than the one that dispatched last + unsigned cusPerSched = m_num_collectors / m_num_warp_scheds; + unsigned rr_increment = + m_sub_core_model ? cusPerSched - (m_last_cu % cusPerSched) : 1; for (unsigned n = 0; n < m_num_collectors; n++) { - unsigned c = (m_last_cu + n + 1) % m_num_collectors; + unsigned c = (m_last_cu + n + rr_increment) % m_num_collectors; if ((*m_collector_units)[c].ready()) { m_last_cu = c; return &((*m_collector_units)[c]); @@ -933,6 +1017,8 @@ class opndcoll_rfu_t { // operand collector based register file unit std::vector<collector_unit_t> *m_collector_units; unsigned m_last_cu; // dispatch ready cu's rr unsigned m_next_cu; // for initialization + bool m_sub_core_model; + unsigned m_num_warp_scheds; }; // opndcoll_rfu_t data members @@ -941,13 +1027,12 @@ class opndcoll_rfu_t { // operand collector based register file unit unsigned m_num_collector_sets; // unsigned m_num_collectors; unsigned m_num_banks; - unsigned m_bank_warp_shift; unsigned m_warp_size; std::vector<collector_unit_t *> m_cu; arbiter_t m_arbiter; unsigned m_num_banks_per_sched; - unsigned m_num_warp_sceds; + unsigned m_num_warp_scheds; bool sub_core_model; // unsigned m_num_ports; @@ -1039,10 +1124,7 @@ class simd_function_unit { ~simd_function_unit() { delete m_dispatch_reg; } // modifiers - virtual void issue(register_set &source_reg) { - source_reg.move_out_to(m_dispatch_reg); - occupied.set(m_dispatch_reg->latency); - } + virtual void issue(register_set &source_reg); virtual void cycle() = 0; virtual void active_lanes_in_pipeline() = 0; @@ -1051,6 +1133,8 @@ class simd_function_unit { virtual bool can_issue(const warp_inst_t &inst) const { return m_dispatch_reg->empty() && !occupied.test(inst.latency); } + virtual bool is_issue_partitioned() = 0; + virtual unsigned get_issue_reg_id() = 0; virtual bool stallable() const = 0; virtual void print(FILE *fp) const { fprintf(fp, "%s dispatch= ", m_name.c_str()); @@ -1070,7 +1154,7 @@ class pipelined_simd_unit : public
simd_function_unit { public: pipelined_simd_unit(register_set *result_port, const shader_core_config *config, unsigned max_latency, - shader_core_ctx *core); + shader_core_ctx *core, unsigned issue_reg_id); // modifiers virtual void cycle(); @@ -1091,6 +1175,8 @@ class pipelined_simd_unit : public simd_function_unit { virtual bool can_issue(const warp_inst_t &inst) const { return simd_function_unit::can_issue(inst); } + virtual bool is_issue_partitioned() = 0; + unsigned get_issue_reg_id() { return m_issue_reg_id; } virtual void print(FILE *fp) const { simd_function_unit::print(fp); for (int s = m_pipeline_depth - 1; s >= 0; s--) { @@ -1106,6 +1192,8 @@ class pipelined_simd_unit : public simd_function_unit { warp_inst_t **m_pipeline_reg; register_set *m_result_port; class shader_core_ctx *m_core; + unsigned m_issue_reg_id; // if sub_core_model is enabled we can only issue + // from a subset of operand collectors unsigned active_insts_in_pipeline; }; @@ -1113,7 +1201,7 @@ class pipelined_simd_unit : public simd_function_unit { class sfu : public pipelined_simd_unit { public: sfu(register_set *result_port, const shader_core_config *config, - shader_core_ctx *core); + shader_core_ctx *core, unsigned issue_reg_id); virtual bool can_issue(const warp_inst_t &inst) const { switch (inst.op) { case SFU_OP: @@ -1129,12 +1217,13 @@ class sfu : public pipelined_simd_unit { } virtual void active_lanes_in_pipeline(); virtual void issue(register_set &source_reg); + bool is_issue_partitioned() { return true; } }; class dp_unit : public pipelined_simd_unit { public: dp_unit(register_set *result_port, const shader_core_config *config, - shader_core_ctx *core); + shader_core_ctx *core, unsigned issue_reg_id); virtual bool can_issue(const warp_inst_t &inst) const { switch (inst.op) { case DP_OP: @@ -1146,12 +1235,13 @@ class dp_unit : public pipelined_simd_unit { } virtual void active_lanes_in_pipeline(); virtual void issue(register_set &source_reg); + bool is_issue_partitioned() { return true; } }; class tensor_core : public pipelined_simd_unit { public: tensor_core(register_set *result_port, const shader_core_config *config, - shader_core_ctx *core); + shader_core_ctx *core, unsigned issue_reg_id); virtual bool can_issue(const warp_inst_t &inst) const { switch (inst.op) { case TENSOR_CORE_OP: @@ -1163,12 +1253,13 @@ class tensor_core : public pipelined_simd_unit { } virtual void active_lanes_in_pipeline(); virtual void issue(register_set &source_reg); + bool is_issue_partitioned() { return true; } }; class int_unit : public pipelined_simd_unit { public: int_unit(register_set *result_port, const shader_core_config *config, - shader_core_ctx *core); + shader_core_ctx *core, unsigned issue_reg_id); virtual bool can_issue(const warp_inst_t &inst) const { switch (inst.op) { case SFU_OP: @@ -1194,12 +1285,13 @@ class int_unit : public pipelined_simd_unit { } virtual void active_lanes_in_pipeline(); virtual void issue(register_set &source_reg); + bool is_issue_partitioned() { return true; } }; class sp_unit : public pipelined_simd_unit { public: sp_unit(register_set *result_port, const shader_core_config *config, - shader_core_ctx *core); + shader_core_ctx *core, unsigned issue_reg_id); virtual bool can_issue(const warp_inst_t &inst) const { switch (inst.op) { case SFU_OP: @@ -1223,13 +1315,14 @@ class sp_unit : public pipelined_simd_unit { } virtual void active_lanes_in_pipeline(); virtual void issue(register_set &source_reg); + bool is_issue_partitioned() { return true; } }; class specialized_unit : public 
pipelined_simd_unit { public: specialized_unit(register_set *result_port, const shader_core_config *config, - shader_core_ctx *core, unsigned supported_op, - char *unit_name, unsigned latency); + shader_core_ctx *core, int supported_op, char *unit_name, + unsigned latency, unsigned issue_reg_id); virtual bool can_issue(const warp_inst_t &inst) const { if (inst.op != m_supported_op) { return false; @@ -1238,9 +1331,10 @@ class specialized_unit : public pipelined_simd_unit { } virtual void active_lanes_in_pipeline(); virtual void issue(register_set &source_reg); + bool is_issue_partitioned() { return true; } private: - unsigned m_supported_op; + int m_supported_op; }; class simt_core_cluster; @@ -1255,10 +1349,21 @@ class ldst_unit : public pipelined_simd_unit { shader_core_ctx *core, opndcoll_rfu_t *operand_collector, Scoreboard *scoreboard, const shader_core_config *config, const memory_config *mem_config, class shader_core_stats *stats, - unsigned sid, unsigned tpc); - + unsigned sid, unsigned tpc, gpgpu_sim *gpu); + + // Add a structure to record the LDGSTS instructions, + // similar to m_pending_writes, but since LDGSTS does not have an output + // register to write to, a new structure needs to be added + /* A multi-level map: unsigned (warp_id) -> unsigned (pc) -> unsigned (addr) + * -> unsigned (count) + */ + std::map<unsigned, std::map<unsigned, std::map<unsigned, unsigned>>> + m_pending_ldgsts; // modifiers virtual void issue(register_set &inst); + bool is_issue_partitioned() { return false; } virtual void cycle(); void fill(mem_fetch *mf); @@ -1334,6 +1439,7 @@ class ldst_unit : public pipelined_simd_unit { warp_inst_t &inst); mem_stage_stall_type process_memory_access_queue_l1cache(l1_cache *cache, warp_inst_t &inst); + gpgpu_sim *m_gpu; const memory_config *m_memory_config; class mem_fetch_interface *m_icnt; @@ -1479,6 +1585,17 @@ class shader_core_config : public core_config { } else break; // we only accept continuous specialized_units, i.e., 1,2,3,4 } + + // parse gpgpu_shmem_option for adaptive cache config + if (adaptive_cache_config) { + std::stringstream ss(gpgpu_shmem_option); + while (ss.good()) { + std::string option; + std::getline(ss, option, ','); + shmem_opt_list.push_back((unsigned)std::stoi(option) * 1024); + } + std::sort(shmem_opt_list.begin(), shmem_opt_list.end()); + } } void reg_options(class OptionParser *opp); unsigned max_cta(const kernel_info_t &k) const; @@ -1553,13 +1670,13 @@ class shader_core_config : public core_config { unsigned int gpgpu_operand_collector_num_out_ports_gen; unsigned int gpgpu_operand_collector_num_out_ports_int; - int gpgpu_num_sp_units; - int gpgpu_tensor_core_avail; - int gpgpu_num_dp_units; - int gpgpu_num_sfu_units; - int gpgpu_num_tensor_core_units; - int gpgpu_num_mem_units; - int gpgpu_num_int_units; + unsigned int gpgpu_num_sp_units; + unsigned int gpgpu_tensor_core_avail; + unsigned int gpgpu_num_dp_units; + unsigned int gpgpu_num_sfu_units; + unsigned int gpgpu_num_tensor_core_units; + unsigned int gpgpu_num_mem_units; + unsigned int gpgpu_num_int_units; // Shader core resources unsigned gpgpu_shader_registers; @@ -1619,18 +1736,26 @@ struct shader_core_stats_pod { unsigned *m_num_INTdecoded_insn; unsigned *m_num_storequeued_insn; unsigned *m_num_loadqueued_insn; - unsigned *m_num_ialu_acesses; - unsigned *m_num_fp_acesses; - unsigned *m_num_imul_acesses; unsigned *m_num_tex_inst; - unsigned *m_num_fpmul_acesses; - unsigned *m_num_idiv_acesses; - unsigned *m_num_fpdiv_acesses; - unsigned *m_num_sp_acesses; - unsigned *m_num_sfu_acesses; - unsigned *m_num_tensor_core_acesses; -
unsigned *m_num_trans_acesses; - unsigned *m_num_mem_acesses; + double *m_num_ialu_acesses; + double *m_num_fp_acesses; + double *m_num_imul_acesses; + double *m_num_fpmul_acesses; + double *m_num_idiv_acesses; + double *m_num_fpdiv_acesses; + double *m_num_sp_acesses; + double *m_num_sfu_acesses; + double *m_num_tensor_core_acesses; + double *m_num_tex_acesses; + double *m_num_const_acesses; + double *m_num_dp_acesses; + double *m_num_dpmul_acesses; + double *m_num_dpdiv_acesses; + double *m_num_sqrt_acesses; + double *m_num_log_acesses; + double *m_num_sin_acesses; + double *m_num_exp_acesses; + double *m_num_mem_acesses; unsigned *m_num_sp_committed; unsigned *m_num_tlb_hits; unsigned *m_num_tlb_accesses; @@ -1640,13 +1765,15 @@ struct shader_core_stats_pod { unsigned *m_read_regfile_acesses; unsigned *m_write_regfile_acesses; unsigned *m_non_rf_operands; - unsigned *m_num_imul24_acesses; - unsigned *m_num_imul32_acesses; + double *m_num_imul24_acesses; + double *m_num_imul32_acesses; unsigned *m_active_sp_lanes; unsigned *m_active_sfu_lanes; unsigned *m_active_tensor_core_lanes; unsigned *m_active_fu_lanes; unsigned *m_active_fu_mem_lanes; + double *m_active_exu_threads; // For power model + double *m_active_exu_warps; // For power model unsigned *m_n_diverge; // number of divergence occurring in this shader unsigned gpgpu_n_load_insn; unsigned gpgpu_n_store_insn; @@ -1656,7 +1783,7 @@ struct shader_core_stats_pod { unsigned gpgpu_n_const_insn; unsigned gpgpu_n_param_insn; unsigned gpgpu_n_shmem_bkconflict; - unsigned gpgpu_n_cache_bkconflict; + unsigned gpgpu_n_l1cache_bkconflict; int gpgpu_n_intrawarp_mshr_merge; unsigned gpgpu_n_cmem_portconflict; unsigned gpu_stall_shd_mem_breakdown[N_MEM_STAGE_ACCESS_TYPE] @@ -1717,35 +1844,38 @@ class shader_core_stats : public shader_core_stats_pod { (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); m_num_loadqueued_insn = (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); - m_num_INTdecoded_insn = - (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); - m_num_ialu_acesses = - (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); - m_num_fp_acesses = - (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); m_num_tex_inst = (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); - m_num_imul_acesses = + m_num_INTdecoded_insn = (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); + m_num_ialu_acesses = (double *)calloc(config->num_shader(), sizeof(double)); + m_num_fp_acesses = (double *)calloc(config->num_shader(), sizeof(double)); + m_num_imul_acesses = (double *)calloc(config->num_shader(), sizeof(double)); m_num_imul24_acesses = - (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); + (double *)calloc(config->num_shader(), sizeof(double)); m_num_imul32_acesses = - (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); + (double *)calloc(config->num_shader(), sizeof(double)); m_num_fpmul_acesses = - (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); - m_num_idiv_acesses = - (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); + (double *)calloc(config->num_shader(), sizeof(double)); + m_num_idiv_acesses = (double *)calloc(config->num_shader(), sizeof(double)); m_num_fpdiv_acesses = - (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); - m_num_sp_acesses = - (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); - m_num_sfu_acesses = - (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); + (double *)calloc(config->num_shader(), sizeof(double)); + 
m_num_dp_acesses = (double *)calloc(config->num_shader(), sizeof(double)); + m_num_dpmul_acesses = + (double *)calloc(config->num_shader(), sizeof(double)); + m_num_dpdiv_acesses = + (double *)calloc(config->num_shader(), sizeof(double)); + m_num_sp_acesses = (double *)calloc(config->num_shader(), sizeof(double)); + m_num_sfu_acesses = (double *)calloc(config->num_shader(), sizeof(double)); m_num_tensor_core_acesses = - (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); - m_num_trans_acesses = - (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); - m_num_mem_acesses = - (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); + (double *)calloc(config->num_shader(), sizeof(double)); + m_num_const_acesses = + (double *)calloc(config->num_shader(), sizeof(double)); + m_num_tex_acesses = (double *)calloc(config->num_shader(), sizeof(double)); + m_num_sqrt_acesses = (double *)calloc(config->num_shader(), sizeof(double)); + m_num_log_acesses = (double *)calloc(config->num_shader(), sizeof(double)); + m_num_sin_acesses = (double *)calloc(config->num_shader(), sizeof(double)); + m_num_exp_acesses = (double *)calloc(config->num_shader(), sizeof(double)); + m_num_mem_acesses = (double *)calloc(config->num_shader(), sizeof(double)); m_num_sp_committed = (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); m_num_tlb_hits = (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); @@ -1759,6 +1889,9 @@ class shader_core_stats : public shader_core_stats_pod { (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); m_active_fu_lanes = (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); + m_active_exu_threads = + (double *)calloc(config->num_shader(), sizeof(double)); + m_active_exu_warps = (double *)calloc(config->num_shader(), sizeof(double)); m_active_fu_mem_lanes = (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); m_num_sfu_committed = @@ -1802,6 +1935,48 @@ class shader_core_stats : public shader_core_stats_pod { delete m_incoming_traffic_stats; free(m_num_sim_insn); free(m_num_sim_winsn); + free(m_num_FPdecoded_insn); + free(m_num_INTdecoded_insn); + free(m_num_storequeued_insn); + free(m_num_loadqueued_insn); + free(m_num_ialu_acesses); + free(m_num_fp_acesses); + free(m_num_imul_acesses); + free(m_num_tex_inst); + free(m_num_fpmul_acesses); + free(m_num_idiv_acesses); + free(m_num_fpdiv_acesses); + free(m_num_sp_acesses); + free(m_num_sfu_acesses); + free(m_num_tensor_core_acesses); + free(m_num_tex_acesses); + free(m_num_const_acesses); + free(m_num_dp_acesses); + free(m_num_dpmul_acesses); + free(m_num_dpdiv_acesses); + free(m_num_sqrt_acesses); + free(m_num_log_acesses); + free(m_num_sin_acesses); + free(m_num_exp_acesses); + free(m_num_mem_acesses); + free(m_num_sp_committed); + free(m_num_tlb_hits); + free(m_num_tlb_accesses); + free(m_num_sfu_committed); + free(m_num_tensor_core_committed); + free(m_num_mem_committed); + free(m_read_regfile_acesses); + free(m_write_regfile_acesses); + free(m_non_rf_operands); + free(m_num_imul24_acesses); + free(m_num_imul32_acesses); + free(m_active_sp_lanes); + free(m_active_sfu_lanes); + free(m_active_tensor_core_lanes); + free(m_active_fu_lanes); + free(m_active_exu_threads); + free(m_active_exu_warps); + free(m_active_fu_mem_lanes); free(m_n_diverge); free(shader_cycle_distro); free(last_shader_cycle_distro); @@ -1840,6 +2015,7 @@ class shader_core_stats : public shader_core_stats_pod { friend class shader_core_ctx; friend class ldst_unit; friend class simt_core_cluster; + friend class 
sst_simt_core_cluster; friend class scheduler_unit; friend class TwoLevelScheduler; friend class LooseRoundRobbinScheduler; @@ -1855,12 +2031,20 @@ class shader_core_mem_fetch_allocator : public mem_fetch_allocator { m_memory_config = config; } mem_fetch *alloc(new_addr_type addr, mem_access_type type, unsigned size, - bool wr, unsigned long long cycle) const; + bool wr, unsigned long long cycle, + unsigned long long streamID) const; + mem_fetch *alloc(new_addr_type addr, mem_access_type type, + const active_mask_t &active_mask, + const mem_access_byte_mask_t &byte_mask, + const mem_access_sector_mask_t §or_mask, unsigned size, + bool wr, unsigned long long cycle, unsigned wid, + unsigned sid, unsigned tpc, mem_fetch *original_mf, + unsigned long long streamID) const; mem_fetch *alloc(const warp_inst_t &inst, const mem_access_t &access, unsigned long long cycle) const { warp_inst_t inst_copy = inst; mem_fetch *mf = new mem_fetch( - access, &inst_copy, + access, &inst_copy, inst.get_streamID(), access.is_write() ? WRITE_PACKET_SIZE : READ_PACKET_SIZE, inst.warp_id(), m_core_id, m_cluster_id, m_memory_config, cycle); return mf; @@ -1900,7 +2084,7 @@ class shader_core_ctx : public core_t { printf("GPGPU-Sim uArch: Shader %d bind to kernel %u \'%s\'\n", m_sid, m_kernel->get_uid(), m_kernel->name().c_str()); } - + PowerscalingCoefficients *scaling_coeffs; // accessors bool fetch_unit_response_buffer_full() const; bool ldst_unit_response_buffer_full() const; @@ -1919,6 +2103,9 @@ class shader_core_ctx : public core_t { // modifiers virtual void warp_exit(unsigned warp_id); + // Ni: Unset ldgdepbar + void unset_depbar(const warp_inst_t &inst); + // accessors virtual bool warp_waiting_at_barrier(unsigned warp_id) const; void get_pdom_stack_top_info(unsigned tid, unsigned *pc, unsigned *rpc) const; @@ -1961,116 +2148,241 @@ class shader_core_ctx : public core_t { void incialu_stat(unsigned active_count, double latency) { if (m_config->gpgpu_clock_gated_lanes == false) { m_stats->m_num_ialu_acesses[m_sid] = - m_stats->m_num_ialu_acesses[m_sid] + active_count * latency + + m_stats->m_num_ialu_acesses[m_sid] + (double)active_count * latency + inactive_lanes_accesses_nonsfu(active_count, latency); } else { m_stats->m_num_ialu_acesses[m_sid] = - m_stats->m_num_ialu_acesses[m_sid] + active_count * latency; + m_stats->m_num_ialu_acesses[m_sid] + (double)active_count * latency; } - } - void inctex_stat(unsigned active_count, double latency) { - m_stats->m_num_tex_inst[m_sid] = - m_stats->m_num_tex_inst[m_sid] + active_count * latency; + m_stats->m_active_exu_threads[m_sid] += active_count; + m_stats->m_active_exu_warps[m_sid]++; } void incimul_stat(unsigned active_count, double latency) { if (m_config->gpgpu_clock_gated_lanes == false) { m_stats->m_num_imul_acesses[m_sid] = - m_stats->m_num_imul_acesses[m_sid] + active_count * latency + + m_stats->m_num_imul_acesses[m_sid] + (double)active_count * latency + inactive_lanes_accesses_nonsfu(active_count, latency); } else { m_stats->m_num_imul_acesses[m_sid] = - m_stats->m_num_imul_acesses[m_sid] + active_count * latency; + m_stats->m_num_imul_acesses[m_sid] + (double)active_count * latency; } + m_stats->m_active_exu_threads[m_sid] += active_count; + m_stats->m_active_exu_warps[m_sid]++; } void incimul24_stat(unsigned active_count, double latency) { if (m_config->gpgpu_clock_gated_lanes == false) { m_stats->m_num_imul24_acesses[m_sid] = - m_stats->m_num_imul24_acesses[m_sid] + active_count * latency + + m_stats->m_num_imul24_acesses[m_sid] + + 
(double)active_count * latency + inactive_lanes_accesses_nonsfu(active_count, latency); } else { m_stats->m_num_imul24_acesses[m_sid] = - m_stats->m_num_imul24_acesses[m_sid] + active_count * latency; + m_stats->m_num_imul24_acesses[m_sid] + (double)active_count * latency; } + m_stats->m_active_exu_threads[m_sid] += active_count; + m_stats->m_active_exu_warps[m_sid]++; } void incimul32_stat(unsigned active_count, double latency) { if (m_config->gpgpu_clock_gated_lanes == false) { m_stats->m_num_imul32_acesses[m_sid] = - m_stats->m_num_imul32_acesses[m_sid] + active_count * latency + + m_stats->m_num_imul32_acesses[m_sid] + + (double)active_count * latency + inactive_lanes_accesses_sfu(active_count, latency); } else { m_stats->m_num_imul32_acesses[m_sid] = - m_stats->m_num_imul32_acesses[m_sid] + active_count * latency; + m_stats->m_num_imul32_acesses[m_sid] + (double)active_count * latency; } - // printf("Int_Mul -- Active_count: %d\n",active_count); + m_stats->m_active_exu_threads[m_sid] += active_count; + m_stats->m_active_exu_warps[m_sid]++; } void incidiv_stat(unsigned active_count, double latency) { if (m_config->gpgpu_clock_gated_lanes == false) { m_stats->m_num_idiv_acesses[m_sid] = - m_stats->m_num_idiv_acesses[m_sid] + active_count * latency + + m_stats->m_num_idiv_acesses[m_sid] + (double)active_count * latency + inactive_lanes_accesses_sfu(active_count, latency); } else { m_stats->m_num_idiv_acesses[m_sid] = - m_stats->m_num_idiv_acesses[m_sid] + active_count * latency; + m_stats->m_num_idiv_acesses[m_sid] + (double)active_count * latency; } + m_stats->m_active_exu_threads[m_sid] += active_count; + m_stats->m_active_exu_warps[m_sid]++; } void incfpalu_stat(unsigned active_count, double latency) { if (m_config->gpgpu_clock_gated_lanes == false) { m_stats->m_num_fp_acesses[m_sid] = - m_stats->m_num_fp_acesses[m_sid] + active_count * latency + + m_stats->m_num_fp_acesses[m_sid] + (double)active_count * latency + inactive_lanes_accesses_nonsfu(active_count, latency); } else { m_stats->m_num_fp_acesses[m_sid] = - m_stats->m_num_fp_acesses[m_sid] + active_count * latency; + m_stats->m_num_fp_acesses[m_sid] + (double)active_count * latency; } + m_stats->m_active_exu_threads[m_sid] += active_count; + m_stats->m_active_exu_warps[m_sid]++; } void incfpmul_stat(unsigned active_count, double latency) { // printf("FP MUL stat increament\n"); if (m_config->gpgpu_clock_gated_lanes == false) { m_stats->m_num_fpmul_acesses[m_sid] = - m_stats->m_num_fpmul_acesses[m_sid] + active_count * latency + + m_stats->m_num_fpmul_acesses[m_sid] + (double)active_count * latency + inactive_lanes_accesses_nonsfu(active_count, latency); } else { m_stats->m_num_fpmul_acesses[m_sid] = - m_stats->m_num_fpmul_acesses[m_sid] + active_count * latency; + m_stats->m_num_fpmul_acesses[m_sid] + (double)active_count * latency; } + m_stats->m_active_exu_threads[m_sid] += active_count; + m_stats->m_active_exu_warps[m_sid]++; } void incfpdiv_stat(unsigned active_count, double latency) { if (m_config->gpgpu_clock_gated_lanes == false) { m_stats->m_num_fpdiv_acesses[m_sid] = - m_stats->m_num_fpdiv_acesses[m_sid] + active_count * latency + + m_stats->m_num_fpdiv_acesses[m_sid] + (double)active_count * latency + inactive_lanes_accesses_sfu(active_count, latency); } else { m_stats->m_num_fpdiv_acesses[m_sid] = - m_stats->m_num_fpdiv_acesses[m_sid] + active_count * latency; + m_stats->m_num_fpdiv_acesses[m_sid] + (double)active_count * latency; + } + m_stats->m_active_exu_threads[m_sid] += active_count; + 
m_stats->m_active_exu_warps[m_sid]++; + } + void incdpalu_stat(unsigned active_count, double latency) { + if (m_config->gpgpu_clock_gated_lanes == false) { + m_stats->m_num_dp_acesses[m_sid] = + m_stats->m_num_dp_acesses[m_sid] + (double)active_count * latency + + inactive_lanes_accesses_nonsfu(active_count, latency); + } else { + m_stats->m_num_dp_acesses[m_sid] = + m_stats->m_num_dp_acesses[m_sid] + (double)active_count * latency; + } + m_stats->m_active_exu_threads[m_sid] += active_count; + m_stats->m_active_exu_warps[m_sid]++; + } + void incdpmul_stat(unsigned active_count, double latency) { + // printf("FP MUL stat increament\n"); + if (m_config->gpgpu_clock_gated_lanes == false) { + m_stats->m_num_dpmul_acesses[m_sid] = + m_stats->m_num_dpmul_acesses[m_sid] + (double)active_count * latency + + inactive_lanes_accesses_nonsfu(active_count, latency); + } else { + m_stats->m_num_dpmul_acesses[m_sid] = + m_stats->m_num_dpmul_acesses[m_sid] + (double)active_count * latency; + } + m_stats->m_active_exu_threads[m_sid] += active_count; + m_stats->m_active_exu_warps[m_sid]++; + } + void incdpdiv_stat(unsigned active_count, double latency) { + if (m_config->gpgpu_clock_gated_lanes == false) { + m_stats->m_num_dpdiv_acesses[m_sid] = + m_stats->m_num_dpdiv_acesses[m_sid] + (double)active_count * latency + + inactive_lanes_accesses_sfu(active_count, latency); + } else { + m_stats->m_num_dpdiv_acesses[m_sid] = + m_stats->m_num_dpdiv_acesses[m_sid] + (double)active_count * latency; + } + m_stats->m_active_exu_threads[m_sid] += active_count; + m_stats->m_active_exu_warps[m_sid]++; + } + + void incsqrt_stat(unsigned active_count, double latency) { + if (m_config->gpgpu_clock_gated_lanes == false) { + m_stats->m_num_sqrt_acesses[m_sid] = + m_stats->m_num_sqrt_acesses[m_sid] + (double)active_count * latency + + inactive_lanes_accesses_sfu(active_count, latency); + } else { + m_stats->m_num_sqrt_acesses[m_sid] = + m_stats->m_num_sqrt_acesses[m_sid] + (double)active_count * latency; + } + m_stats->m_active_exu_threads[m_sid] += active_count; + m_stats->m_active_exu_warps[m_sid]++; + } + + void inclog_stat(unsigned active_count, double latency) { + if (m_config->gpgpu_clock_gated_lanes == false) { + m_stats->m_num_log_acesses[m_sid] = + m_stats->m_num_log_acesses[m_sid] + (double)active_count * latency + + inactive_lanes_accesses_sfu(active_count, latency); + } else { + m_stats->m_num_log_acesses[m_sid] = + m_stats->m_num_log_acesses[m_sid] + (double)active_count * latency; + } + m_stats->m_active_exu_threads[m_sid] += active_count; + m_stats->m_active_exu_warps[m_sid]++; + } + + void incexp_stat(unsigned active_count, double latency) { + if (m_config->gpgpu_clock_gated_lanes == false) { + m_stats->m_num_exp_acesses[m_sid] = + m_stats->m_num_exp_acesses[m_sid] + (double)active_count * latency + + inactive_lanes_accesses_sfu(active_count, latency); + } else { + m_stats->m_num_exp_acesses[m_sid] = + m_stats->m_num_exp_acesses[m_sid] + (double)active_count * latency; + } + m_stats->m_active_exu_threads[m_sid] += active_count; + m_stats->m_active_exu_warps[m_sid]++; + } + + void incsin_stat(unsigned active_count, double latency) { + if (m_config->gpgpu_clock_gated_lanes == false) { + m_stats->m_num_sin_acesses[m_sid] = + m_stats->m_num_sin_acesses[m_sid] + (double)active_count * latency + + inactive_lanes_accesses_sfu(active_count, latency); + } else { + m_stats->m_num_sin_acesses[m_sid] = + m_stats->m_num_sin_acesses[m_sid] + (double)active_count * latency; } + m_stats->m_active_exu_threads[m_sid] += 
active_count; + m_stats->m_active_exu_warps[m_sid]++; } - void inctrans_stat(unsigned active_count, double latency) { + + void inctensor_stat(unsigned active_count, double latency) { if (m_config->gpgpu_clock_gated_lanes == false) { - m_stats->m_num_trans_acesses[m_sid] = - m_stats->m_num_trans_acesses[m_sid] + active_count * latency + + m_stats->m_num_tensor_core_acesses[m_sid] = + m_stats->m_num_tensor_core_acesses[m_sid] + + (double)active_count * latency + inactive_lanes_accesses_sfu(active_count, latency); } else { - m_stats->m_num_trans_acesses[m_sid] = - m_stats->m_num_trans_acesses[m_sid] + active_count * latency; + m_stats->m_num_tensor_core_acesses[m_sid] = + m_stats->m_num_tensor_core_acesses[m_sid] + + (double)active_count * latency; } + m_stats->m_active_exu_threads[m_sid] += active_count; + m_stats->m_active_exu_warps[m_sid]++; + } + + void inctex_stat(unsigned active_count, double latency) { + if (m_config->gpgpu_clock_gated_lanes == false) { + m_stats->m_num_tex_acesses[m_sid] = + m_stats->m_num_tex_acesses[m_sid] + (double)active_count * latency + + inactive_lanes_accesses_sfu(active_count, latency); + } else { + m_stats->m_num_tex_acesses[m_sid] = + m_stats->m_num_tex_acesses[m_sid] + (double)active_count * latency; + } + m_stats->m_active_exu_threads[m_sid] += active_count; + m_stats->m_active_exu_warps[m_sid]++; + } + + void inc_const_accesses(unsigned active_count) { + m_stats->m_num_const_acesses[m_sid] = + m_stats->m_num_const_acesses[m_sid] + active_count; } void incsfu_stat(unsigned active_count, double latency) { m_stats->m_num_sfu_acesses[m_sid] = - m_stats->m_num_sfu_acesses[m_sid] + active_count * latency; + m_stats->m_num_sfu_acesses[m_sid] + (double)active_count * latency; } void incsp_stat(unsigned active_count, double latency) { m_stats->m_num_sp_acesses[m_sid] = - m_stats->m_num_sp_acesses[m_sid] + active_count * latency; + m_stats->m_num_sp_acesses[m_sid] + (double)active_count * latency; } void incmem_stat(unsigned active_count, double latency) { if (m_config->gpgpu_clock_gated_lanes == false) { m_stats->m_num_mem_acesses[m_sid] = - m_stats->m_num_mem_acesses[m_sid] + active_count * latency + + m_stats->m_num_mem_acesses[m_sid] + (double)active_count * latency + inactive_lanes_accesses_nonsfu(active_count, latency); } else { m_stats->m_num_mem_acesses[m_sid] = - m_stats->m_num_mem_acesses[m_sid] + active_count * latency; + m_stats->m_num_mem_acesses[m_sid] + (double)active_count * latency; } } void incexecstat(warp_inst_t *&inst); @@ -2133,8 +2445,8 @@ class shader_core_ctx : public core_t { friend class TwoLevelScheduler; friend class LooseRoundRobbinScheduler; virtual void issue_warp(register_set &warp, const warp_inst_t *pI, - const active_mask_t &active_mask, unsigned warp_id, - unsigned sch_id); + const active_mask_t &active_mask, unsigned warp_id, + unsigned sch_id); void create_front_pipeline(); void create_schedulers(); @@ -2313,6 +2625,7 @@ class simt_core_cluster { void cache_invalidate(); bool icnt_injection_buffer_full(unsigned size, bool write); void icnt_inject_request_packet(class mem_fetch *mf); + void update_icnt_stats(class mem_fetch *mf); // for perfect memory interface bool response_queue_full() { @@ -2374,6 +2687,50 @@ class exec_simt_core_cluster : public simt_core_cluster { virtual void create_shader_core_ctx(); }; +/** + * @brief SST cluster class + * + */ +class sst_simt_core_cluster : public exec_simt_core_cluster { + public: + sst_simt_core_cluster(class gpgpu_sim *gpu, unsigned cluster_id, + const shader_core_config *config, + 
const memory_config *mem_config, + class shader_core_stats *stats, + class memory_stats_t *mstats) + : exec_simt_core_cluster(gpu, cluster_id, config, mem_config, stats, + mstats) {} + + /** + * @brief Check if SST memory request injection + * buffer is full by using extern + * function is_SST_buffer_full() + * defined in Balar + * + * @param size + * @param write + * @param type + * @return true + * @return false + */ + bool SST_injection_buffer_full(unsigned size, bool write, + mem_access_type type); + + /** + * @brief Send memory request packets to SST + * memory + * + * @param mf + */ + void icnt_inject_request_packet_to_SST(class mem_fetch *mf); + + /** + * @brief Advance ICNT between core and SST + * + */ + void icnt_cycle_SST(); +}; + class shader_memory_interface : public mem_fetch_interface { public: shader_memory_interface(shader_core_ctx *core, simt_core_cluster *cluster) { @@ -2414,6 +2771,61 @@ class perfect_memory_interface : public mem_fetch_interface { simt_core_cluster *m_cluster; }; +/** + * @brief SST memory interface + * + */ +class sst_memory_interface : public mem_fetch_interface { + public: + sst_memory_interface(shader_core_ctx *core, sst_simt_core_cluster *cluster) { + m_core = core; + m_cluster = cluster; + } + /** + * @brief For constant, inst, tex cache access + * + * @param size + * @param write + * @return true + * @return false + */ + virtual bool full(unsigned size, bool write) const { + assert(false && "Use the full() method with access type instead!"); + return true; + } + + /** + * @brief With SST, the core will direct all mem access except for + * constant, tex, and inst reads to SST mem system + * (i.e. not modeling constant mem right now), thus + * requiring the mem_access_type information to be passed in + * + * @param size + * @param write + * @param type + * @return true + * @return false + */ + bool full(unsigned size, bool write, mem_access_type type) const { + return m_cluster->SST_injection_buffer_full(size, write, type); + } + + /** + * @brief Push memory request to SST memory system and + * update stats + * + * @param mf + */ + virtual void push(mem_fetch *mf) { + m_core->inc_simt_to_mem(mf->get_num_flits(true)); + m_cluster->icnt_inject_request_packet_to_SST(mf); + } + + private: + shader_core_ctx *m_core; + sst_simt_core_cluster *m_cluster; +}; + inline int scheduler_unit::get_sid() const { return m_shader->get_sid(); } #endif /* SHADER_H */ diff --git a/src/gpgpu-sim/shader_trace.h b/src/gpgpu-sim/shader_trace.h index e7486d8b0..367262c90 100644 --- a/src/gpgpu-sim/shader_trace.h +++ b/src/gpgpu-sim/shader_trace.h @@ -38,7 +38,7 @@ #define SCHED_PRINT_STR SHADER_PRINT_STR "Scheduler %d - " #define SHADER_DTRACE(x) \ (DTRACE(x) && \ - (Trace::sampling_core == get_sid() || Trace::sampling_core == -1)) + (Trace::sampling_core == (int)get_sid() || Trace::sampling_core == -1)) // Intended to be called from inside components of a shader core. 
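
A note on the sst_memory_interface defined earlier in this hunk: the inherited two-argument full() is made to assert so that callers must use the typed overload, because only some access types are routed to the SST memory system. A minimal self-contained sketch of that assert-and-overload pattern follows; the stub names and the local-path shortcut are illustrative assumptions, and only the idea of dispatching on the access type comes from this patch.

#include <cassert>
#include <cstdio>

enum access_type_stub { GLOBAL_R, CONST_R, INST_R };  // stand-in enum

struct fetch_interface_stub {
  virtual bool full(unsigned size, bool write) const = 0;
  virtual ~fetch_interface_stub() {}
};

struct sst_interface_stub : fetch_interface_stub {
  // Untyped queries are a programming error under SST: the right answer
  // depends on where the access is routed.
  bool full(unsigned size, bool write) const override {
    assert(false && "Use the full() method with access type instead!");
    return true;
  }
  // Typed overload: global traffic asks the SST-side buffer; const/tex/inst
  // reads stay on the simulator's own path (simplified here).
  bool full(unsigned size, bool write, access_type_stub type) const {
    if (type == CONST_R || type == INST_R) return false;
    return sst_buffer_full(size, write);  // would call into Balar
  }
  bool sst_buffer_full(unsigned, bool) const { return false; }  // stub
};

int main() {
  sst_interface_stub icnt;
  printf("global stalled? %d\n", icnt.full(32, false, GLOBAL_R));
  return 0;
}
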
// Depends on a get_sid() function diff --git a/src/gpgpu-sim/stat-tool.cc b/src/gpgpu-sim/stat-tool.cc index 6fafaa6af..08bbe9e02 100644 --- a/src/gpgpu-sim/stat-tool.cc +++ b/src/gpgpu-sim/stat-tool.cc @@ -369,8 +369,6 @@ void shader_mem_lat_print(FILE *fout) { static int s_cache_access_logger_n_types = 0; static std::vector<linear_histogram_logger> s_cache_access_logger; -enum cache_access_logger_types { NORMALS, TEXTURE, CONSTANT, INSTRUCTION }; - int get_shader_normal_cache_id() { return NORMALS; } int get_shader_texture_cache_id() { return TEXTURE; } int get_shader_constant_cache_id() { return CONSTANT; } @@ -521,7 +519,7 @@ void thread_insn_span::print_span(FILE *fout) const { fprintf(fout, "%d: ", (int)m_cycle); span_count_map::const_iterator i_sc = m_insn_span_count.begin(); for (; i_sc != m_insn_span_count.end(); ++i_sc) { - fprintf(fout, "%d ", i_sc->first); + fprintf(fout, "%llx ", i_sc->first); } fprintf(fout, "\n"); } diff --git a/src/gpgpu-sim/stat-tool.h b/src/gpgpu-sim/stat-tool.h index 3a291be3a..fdf875600 100644 --- a/src/gpgpu-sim/stat-tool.h +++ b/src/gpgpu-sim/stat-tool.h @@ -268,6 +268,8 @@ class linear_histogram_logger : public snap_shot_trigger, static int s_ids; }; +enum cache_access_logger_types { NORMALS, TEXTURE, CONSTANT, INSTRUCTION }; + void try_snap_shot(unsigned long long current_cycle); void set_spill_interval(unsigned long long interval); void spill_log_to_file(FILE *fout, int final, unsigned long long current_cycle); diff --git a/src/gpgpusim_entrypoint.cc b/src/gpgpusim_entrypoint.cc index f4287d8a7..839fef619 100644 --- a/src/gpgpusim_entrypoint.cc +++ b/src/gpgpusim_entrypoint.cc @@ -43,6 +43,20 @@ static int sg_argc = 3; static const char *sg_argv[] = {"", "-config", "gpgpusim.config"}; +// Helper funcs to avoid multiple '->' for SST +GPGPUsim_ctx *GPGPUsim_ctx_ptr() { return GPGPU_Context()->the_gpgpusim; } + +class sst_gpgpu_sim *g_the_gpu() { + return static_cast<sst_gpgpu_sim *>(GPGPUsim_ctx_ptr()->g_the_gpu); +} + +class stream_manager *g_stream_manager() { + return GPGPUsim_ctx_ptr()->g_stream_manager; +} + +// SST callback +extern void SST_callback_cudaThreadSynchronize_done(); + void *gpgpu_sim_thread_sequential(void *ctx_ptr) { gpgpu_context *ctx = (gpgpu_context *)ctx_ptr; // at most one kernel running at a time @@ -57,7 +71,8 @@ void *gpgpu_sim_thread_sequential(void *ctx_ptr) { ctx->the_gpgpusim->g_the_gpu->cycle(); ctx->the_gpgpusim->g_the_gpu->deadlock_check(); } - ctx->the_gpgpusim->g_the_gpu->print_stats(); + ctx->the_gpgpusim->g_the_gpu->print_stats( + ctx->the_gpgpusim->g_the_gpu->last_streamID); ctx->the_gpgpusim->g_the_gpu->update_stats(); ctx->print_simulation_time(); } @@ -144,7 +159,8 @@ void *gpgpu_sim_thread_concurrent(void *ctx_ptr) { fflush(stdout); } if (sim_cycles) { - ctx->the_gpgpusim->g_the_gpu->print_stats(); + ctx->the_gpgpusim->g_the_gpu->print_stats( + ctx->the_gpgpusim->g_the_gpu->last_streamID); ctx->the_gpgpusim->g_the_gpu->update_stats(); ctx->print_simulation_time(); } @@ -167,6 +183,75 @@ return NULL; } +bool sst_sim_cycles = false; + +bool SST_Cycle() { + // Check if Synchronize is done when SST previously requested + // cudaThreadSynchronize + if (GPGPU_Context()->requested_synchronize && + ((g_stream_manager()->empty() && !GPGPUsim_ctx_ptr()->g_sim_active) || + GPGPUsim_ctx_ptr()->g_sim_done)) { + SST_callback_cudaThreadSynchronize_done(); + GPGPU_Context()->requested_synchronize = false; + } + + if (g_stream_manager()->empty_protected() && + !GPGPUsim_ctx_ptr()->g_sim_done &&
!g_the_gpu()->active()) { + GPGPUsim_ctx_ptr()->g_sim_active = false; + // printf("stream is empty %d \n", g_stream_manager->empty()); + return false; + } + + if (g_stream_manager()->operation(&sst_sim_cycles) && + !g_the_gpu()->active()) { + if (sst_sim_cycles) { + sst_sim_cycles = false; + } + return false; + } + + // printf("GPGPU-Sim: Give GPU Cycle\n"); + GPGPUsim_ctx_ptr()->g_sim_active = true; + + // functional simulation + if (g_the_gpu()->is_functional_sim()) { + kernel_info_t *kernel = g_the_gpu()->get_functional_kernel(); + assert(kernel); + GPGPUsim_ctx_ptr()->gpgpu_ctx->func_sim->gpgpu_cuda_ptx_sim_main_func( + *kernel); + g_the_gpu()->finish_functional_sim(kernel); + } + + // performance simulation + if (g_the_gpu()->active()) { + g_the_gpu()->SST_cycle(); + sst_sim_cycles = true; + g_the_gpu()->deadlock_check(); + } else { + if (g_the_gpu()->cycle_insn_cta_max_hit()) { + g_stream_manager()->stop_all_running_kernels(); + GPGPUsim_ctx_ptr()->g_sim_done = true; + GPGPUsim_ctx_ptr()->g_sim_active = false; + GPGPUsim_ctx_ptr()->break_limit = true; + } + } + + if (!g_the_gpu()->active()) { + g_the_gpu()->print_stats(GPGPUsim_ctx_ptr()->g_the_gpu->last_streamID); + g_the_gpu()->update_stats(); + GPGPU_Context()->print_simulation_time(); + } + + if (GPGPUsim_ctx_ptr()->break_limit) { + printf( + "GPGPU-Sim: ** break due to reaching the maximum cycles (or " + "instructions) **\n"); + return true; + } + + return false; +} + void gpgpu_context::synchronize() { printf("GPGPU-Sim: synchronize waiting for inactive GPU simulation\n"); the_gpgpusim->g_stream_manager->print(stdout); @@ -185,6 +270,27 @@ void gpgpu_context::synchronize() { // sem_post(&g_sim_signal_start); } +bool gpgpu_context::synchronize_check() { + // printf("GPGPU-Sim: synchronize checking for inactive GPU simulation\n"); + requested_synchronize = true; + the_gpgpusim->g_stream_manager->print(stdout); + fflush(stdout); + // sem_wait(&g_sim_signal_finish); + bool done = false; + pthread_mutex_lock(&(the_gpgpusim->g_sim_lock)); + done = (the_gpgpusim->g_stream_manager->empty() && + !the_gpgpusim->g_sim_active) || + the_gpgpusim->g_sim_done; + pthread_mutex_unlock(&(the_gpgpusim->g_sim_lock)); + if (done) { + printf( + "GPGPU-Sim: synchronize checking: detected inactive GPU simulation " + "thread\n"); + } + fflush(stdout); + return done; +} + void gpgpu_context::exit_simulation() { the_gpgpusim->g_sim_done = true; printf("GPGPU-Sim: exit_simulation called\n"); @@ -218,8 +324,14 @@ gpgpu_sim *gpgpu_context::gpgpu_ptx_sim_init_perf() { assert(setlocale(LC_NUMERIC, "C")); the_gpgpusim->g_the_gpu_config->init(); - the_gpgpusim->g_the_gpu = - new exec_gpgpu_sim(*(the_gpgpusim->g_the_gpu_config), this); + if (the_gpgpusim->g_the_gpu_config->is_SST_mode()) { + // Create SST specific GPGPUSim + the_gpgpusim->g_the_gpu = + new sst_gpgpu_sim(*(the_gpgpusim->g_the_gpu_config), this); + } else { + the_gpgpusim->g_the_gpu = + new exec_gpgpu_sim(*(the_gpgpusim->g_the_gpu_config), this); + } the_gpgpusim->g_stream_manager = new stream_manager( (the_gpgpusim->g_the_gpu), func_sim->g_cuda_launch_blocking); @@ -235,12 +347,17 @@ gpgpu_sim *gpgpu_context::gpgpu_ptx_sim_init_perf() { void gpgpu_context::start_sim_thread(int api) { if (the_gpgpusim->g_sim_done) { the_gpgpusim->g_sim_done = false; - if (api == 1) { - pthread_create(&(the_gpgpusim->g_simulation_thread), NULL, - gpgpu_sim_thread_concurrent, (void *)this); + if (the_gpgpusim->g_the_gpu_config->is_SST_mode()) { + // Do not create concurrent thread in SST mode + g_the_gpu()->init(); 
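
For context on the SST branch above: in SST mode no simulation thread is spawned; init() prepares the GPU, and the SST side is then expected to drive the simulator by calling SST_Cycle() (defined earlier in this file) from its clock handler. A hedged sketch of such a driver follows; the handler name and the end-of-simulation step are assumptions about the Balar component, and only SST_Cycle() itself is part of this patch.

// Illustrative SST-side clock handler (names assumed; only SST_Cycle is real).
extern bool SST_Cycle();

bool balar_clock_tick() {
  // Advance GPGPU-Sim by one core-clock step. SST_Cycle() returns true only
  // once cycle_insn_cta_max_hit() trips the break limit.
  bool break_limit = SST_Cycle();
  if (break_limit) {
    // here the component would tell SST that the simulation may end
    return true;  // returning true unregisters this clock handler
  }
  return false;  // keep ticking
}
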
} else { - pthread_create(&(the_gpgpusim->g_simulation_thread), NULL, - gpgpu_sim_thread_sequential, (void *)this); + if (api == 1) { + pthread_create(&(the_gpgpusim->g_simulation_thread), NULL, + gpgpu_sim_thread_concurrent, (void *)this); + } else { + pthread_create(&(the_gpgpusim->g_simulation_thread), NULL, + gpgpu_sim_thread_sequential, (void *)this); + } } } } @@ -264,8 +381,13 @@ void gpgpu_context::print_simulation_time() { const unsigned cycles_per_sec = (unsigned)(the_gpgpusim->g_the_gpu->gpu_tot_sim_cycle / difference); printf("gpgpu_simulation_rate = %u (cycle/sec)\n", cycles_per_sec); - printf("gpgpu_silicon_slowdown = %ux\n", - the_gpgpusim->g_the_gpu->shader_clock() * 1000 / cycles_per_sec); + + if (cycles_per_sec == 0) { + printf("gpgpu_silicon_slowdown = Nan\n"); + } else { + printf("gpgpu_silicon_slowdown = %ux\n", + the_gpgpusim->g_the_gpu->shader_clock() * 1000 / cycles_per_sec); + } fflush(stdout); } diff --git a/src/intersim2/CMakeLists.txt b/src/intersim2/CMakeLists.txt new file mode 100644 index 000000000..c3da1b1da --- /dev/null +++ b/src/intersim2/CMakeLists.txt @@ -0,0 +1,106 @@ +option(GPGPUSIM_INTERSIM_STANDALONE "Whether to also build intersim in standalone mode" OFF) + +# Specify Flex and Bison target +BISON_TARGET(intersim_config_parser config.y ${CMAKE_CURRENT_BINARY_DIR}/y.tab.c + COMPILE_FLAGS "-y -d --file-prefix=${CMAKE_CURRENT_BINARY_DIR}/y") +FLEX_TARGET(intersim_config_lexer config.l ${CMAKE_CURRENT_BINARY_DIR}/lex.yy.c) +ADD_FLEX_BISON_DEPENDENCY(intersim_config_lexer intersim_config_parser) + +# Set generated source files to CXX +set_source_files_properties(${BISON_intersim_config_parser_OUTPUT_SOURCE} + ${FLEX_intersim_config_lexer_OUTPUTS} + PROPERTIES LANGUAGE CXX) + +# Create booksim or libintersim.a +# Shared include path +list(APPEND intersim_INC ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/allocators + ${CMAKE_CURRENT_SOURCE_DIR}/arbiters + ${CMAKE_CURRENT_SOURCE_DIR}/networks + ${CMAKE_CURRENT_SOURCE_DIR}/power + ${CMAKE_CURRENT_SOURCE_DIR}/routers + ${PROJECT_SOURCE_DIR}/src) + +# Shared source files +list(APPEND intersim_SRC + ${BISON_intersim_config_parser_OUTPUT_SOURCE} + ${FLEX_intersim_config_lexer_OUTPUTS} + allocators/allocator.cpp + allocators/islip.cpp + allocators/loa.cpp + allocators/maxsize.cpp + allocators/pim.cpp + allocators/selalloc.cpp + allocators/separable.cpp + allocators/separable_input_first.cpp + allocators/separable_output_first.cpp + allocators/wavefront.cpp + arbiters/arbiter.cpp + arbiters/matrix_arb.cpp + arbiters/prio_arb.cpp + arbiters/roundrobin_arb.cpp + arbiters/tree_arb.cpp + batchtrafficmanager.cpp + booksim_config.cpp + buffer.cpp + buffer_state.cpp + config_utils.cpp + credit.cpp + flitchannel.cpp + flit.cpp + gputrafficmanager.cpp + injection.cpp + interconnect_interface.cpp + intersim_config.cpp + main.cpp + misc_utils.cpp + module.cpp + networks/anynet.cpp + networks/cmesh.cpp + networks/dragonfly.cpp + networks/fattree.cpp + networks/flatfly_onchip.cpp + networks/fly.cpp + networks/kncube.cpp + networks/network.cpp + networks/qtree.cpp + networks/tree4.cpp + outputset.cpp + packet_reply_info.cpp + power/buffer_monitor.cpp + power/power_module.cpp + power/switch_monitor.cpp + rng_double_wrapper.cpp + rng_wrapper.cpp + routefunc.cpp + routers/chaos_router.cpp + routers/event_router.cpp + routers/iq_router.cpp + routers/router.cpp + stats.cpp + traffic.cpp + trafficmanager.cpp + vc.cpp) + +# If standalone, also build for it +if(GPGPUSIM_INTERSIM_STANDALONE) + list(REMOVE_ITEM 
${intersim_SRC} interconnect_interface.cpp) + add_executable(booksim ${intersim_SRC}) + target_include_directories(booksim PUBLIC + ${intersim_INC}) + target_include_directories(booksim PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) + target_include_directories(booksim PUBLIC ${CUDAToolkit_INCLUDE_DIRS}) + # Remove globally set TRACING_ON flag + target_compile_options(booksim PRIVATE -UTRACING_ON) +endif() + +# Specify sources for libintersim.a +add_library(intersim STATIC ${intersim_SRC}) +target_include_directories(intersim PUBLIC + ${intersim_INC} + ${PROJECT_SOURCE_DIR}/src/gpgpu-sim) +target_include_directories(intersim PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) +target_include_directories(intersim PUBLIC ${CUDAToolkit_INCLUDE_DIRS}) +target_compile_definitions(intersim PRIVATE CREATE_LIBRARY) +# Remove globally set TRACING_ON flag +target_compile_options(intersim PRIVATE -UTRACING_ON) diff --git a/src/intersim2/Makefile b/src/intersim2/Makefile index 3eeeb7041..a7485e23f 100644 --- a/src/intersim2/Makefile +++ b/src/intersim2/Makefile @@ -28,7 +28,7 @@ # Makefile # CXX = g++ -CC = gcc +CC = g++ CREATE_LIBRARY ?= 0 INTERFACE = interconnect_interface.cpp DEBUG ?= 0 @@ -136,10 +136,10 @@ depend: makedepend -f$(OBJDIR)/Makefile.makedepend -I$(INCPATH) -p$(OBJDIR)/ $(ALL_SRCS) 2> /dev/null ${LEX_OBJS}: $(OBJDIR)/lex.yy.c $(OBJDIR)/y.tab.h - $(CC) $(CPPFLAGS) -c $< -o $@ + $(CC) -Wno-unused-function $(CPPFLAGS) -c $< -o $@ ${YACC_OBJS}: $(OBJDIR)/y.tab.c $(OBJDIR)/y.tab.h - $(CC) $(CPPFLAGS) -c $< -o $@ + $(CC) -Wno-unused-function $(CPPFLAGS) -c $< -o $@ ${OBJDIR}/%.o: %.cpp $(CXX) $(CPPFLAGS) -c $< -o $@ diff --git a/src/intersim2/config_utils.cpp b/src/intersim2/config_utils.cpp index fad5fceb1..a896a93a6 100644 --- a/src/intersim2/config_utils.cpp +++ b/src/intersim2/config_utils.cpp @@ -199,27 +199,27 @@ Configuration * Configuration::GetTheConfig() //============================================================ -extern "C" void config_error( char const * msg, int lineno ) +void config_error( char * msg, int lineno ) { Configuration::GetTheConfig( )->ParseError( msg, lineno ); } -extern "C" void config_assign_string( char const * field, char const * value ) + void config_assign_string( char const * field, char const * value ) { Configuration::GetTheConfig()->Assign(field, value); } -extern "C" void config_assign_int( char const * field, int value ) +void config_assign_int( char const * field, int value ) { Configuration::GetTheConfig()->Assign(field, value); } -extern "C" void config_assign_float( char const * field, double value ) +void config_assign_float( char const * field, double value ) { Configuration::GetTheConfig()->Assign(field, value); } -extern "C" int config_input(char * line, int max_size) +int config_input(char * line, int max_size) { return Configuration::GetTheConfig()->Input(line, max_size); } diff --git a/src/intersim2/config_utils.hpp b/src/intersim2/config_utils.hpp index de3343bb0..1d960b6ab 100644 --- a/src/intersim2/config_utils.hpp +++ b/src/intersim2/config_utils.hpp @@ -35,7 +35,7 @@ #include #include -extern "C" int yyparse(); +int yyparse(); class Configuration { static Configuration * theConfig; diff --git a/src/intersim2/interconnect_interface.cpp b/src/intersim2/interconnect_interface.cpp index 1e1a2d73b..438852e0a 100644 --- a/src/intersim2/interconnect_interface.cpp +++ b/src/intersim2/interconnect_interface.cpp @@ -200,7 +200,7 @@ void InterconnectInterface::Push(unsigned input_deviceID, unsigned output_device void* InterconnectInterface::Pop(unsigned deviceID) { int 
icntID = _node_map[deviceID]; -#if DEBUG +#if 0 cout<<"Call interconnect POP " << output< grp_output) { @@ -495,8 +495,8 @@ void ugal_dragonflynew( const Router *r, const Flit *f, int in_channel, int debug = f->watch; int out_port = -1; int out_vc = 0; - int min_queue_size, min_hopcnt; - int nonmin_queue_size, nonmin_hopcnt; + int min_queue_size; //, min_hopcnt; + int nonmin_queue_size; //, nonmin_hopcnt; int intm_grp_ID; int intm_rID; @@ -523,13 +523,13 @@ void ugal_dragonflynew( const Router *r, const Flit *f, int in_channel, f->ph = 1; } else { //congestion metrics using queue length, obtained by GetUsedCredit() - min_hopcnt = dragonflynew_hopcnt(f->src, f->dest); + // min_hopcnt = dragonflynew_hopcnt(f->src, f->dest); min_router_output = dragonfly_port(rID, f->src, f->dest); min_queue_size = max(r->GetUsedCredit(min_router_output), 0) ; - nonmin_hopcnt = dragonflynew_hopcnt(f->src, f->intm) + - dragonflynew_hopcnt(f->intm,f->dest); + // nonmin_hopcnt = dragonflynew_hopcnt(f->src, f->intm) + + // dragonflynew_hopcnt(f->intm,f->dest); nonmin_router_output = dragonfly_port(rID, f->src, f->intm); nonmin_queue_size = max(r->GetUsedCredit(nonmin_router_output), 0); diff --git a/src/intersim2/networks/flatfly_onchip.cpp b/src/intersim2/networks/flatfly_onchip.cpp index fd17c1a41..df4337175 100644 --- a/src/intersim2/networks/flatfly_onchip.cpp +++ b/src/intersim2/networks/flatfly_onchip.cpp @@ -1204,7 +1204,7 @@ void ugal_pni_flatfly_onchip( const Router *r, const Flit *f, int in_channel, int find_distance (int src, int dest) { int dist = 0; int _dim = gN; - int _dim_size; + // int _dim_size; int src_tmp= (int) src / gC; int dest_tmp = (int) dest / gC; @@ -1212,7 +1212,7 @@ int find_distance (int src, int dest) { // cout << " HOP CNT between src: " << src << " dest: " << dest; for (int d=0;d < _dim; d++) { - _dim_size = powi(gK, d )*gC; + // _dim_size = powi(gK, d )*gC; //if ((int)(src / _dim_size) != (int)(dest / _dim_size)) // dist++; src_id = src_tmp % gK; diff --git a/src/intersim2/networks/kncube.cpp b/src/intersim2/networks/kncube.cpp index 03e13e713..178c90594 100644 --- a/src/intersim2/networks/kncube.cpp +++ b/src/intersim2/networks/kncube.cpp @@ -231,7 +231,7 @@ void KNCube::InsertRandomFaults( const Configuration &config ) int num_fails; unsigned long prev_seed; - int node, chan; + int node, chan = 0; int i, j, t, n, c; bool available; diff --git a/src/intersim2/networks/qtree.cpp b/src/intersim2/networks/qtree.cpp index 72149475e..37d3d7c7d 100644 --- a/src/intersim2/networks/qtree.cpp +++ b/src/intersim2/networks/qtree.cpp @@ -84,7 +84,7 @@ void QTree::_BuildNet( const Configuration& config ) { ostringstream routerName; - int h, r, pos, port; + int h, r = 0 , pos, port; for (h = 0; h < _n; h++) { for (pos = 0 ; pos < powi( _k, h ) ; ++pos ) { diff --git a/src/intersim2/vc.cpp b/src/intersim2/vc.cpp index 94e8c6bf6..4c9444526 100644 --- a/src/intersim2/vc.cpp +++ b/src/intersim2/vc.cpp @@ -82,7 +82,7 @@ void VC::AddFlit( Flit *f ) assert(f); if(_expected_pid >= 0) { - if(f->pid != _expected_pid) { + if((long long int)f->pid != _expected_pid) { ostringstream err; err << "Received flit " << f->id << " with unexpected packet ID: " << f->pid << " (expected: " << _expected_pid << ")"; diff --git a/src/stream_manager.cc b/src/stream_manager.cc index e99bf8783..b974791d0 100644 --- a/src/stream_manager.cc +++ b/src/stream_manager.cc @@ -34,6 +34,12 @@ unsigned CUstream_st::sm_next_stream_uid = 0; +// SST memcpy callbacks +extern void SST_callback_memcpy_H2D_done(); +extern void 
SST_callback_memcpy_D2H_done(); +extern void SST_callback_memcpy_to_symbol_done(); +extern void SST_callback_memcpy_from_symbol_done(); + CUstream_st::CUstream_st() { m_pending = false; m_uid = sm_next_stream_uid++; @@ -122,11 +128,13 @@ bool stream_operation::do_operation(gpgpu_sim *gpu) { if (g_debug_execution >= 3) printf("memcpy host-to-device\n"); gpu->memcpy_to_gpu(m_device_address_dst, m_host_address_src, m_cnt); m_stream->record_next_done(); + if (gpu->is_SST_mode()) SST_callback_memcpy_H2D_done(); break; case stream_memcpy_device_to_host: if (g_debug_execution >= 3) printf("memcpy device-to-host\n"); gpu->memcpy_from_gpu(m_host_address_dst, m_device_address_src, m_cnt); m_stream->record_next_done(); + if (gpu->is_SST_mode()) SST_callback_memcpy_D2H_done(); break; case stream_memcpy_device_to_device: if (g_debug_execution >= 3) printf("memcpy device-to-device\n"); @@ -138,12 +146,14 @@ bool stream_operation::do_operation(gpgpu_sim *gpu) { gpu->gpgpu_ctx->func_sim->gpgpu_ptx_sim_memcpy_symbol( m_symbol, m_host_address_src, m_cnt, m_offset, 1, gpu); m_stream->record_next_done(); + if (gpu->is_SST_mode()) SST_callback_memcpy_to_symbol_done(); break; case stream_memcpy_from_symbol: if (g_debug_execution >= 3) printf("memcpy from symbol\n"); gpu->gpgpu_ctx->func_sim->gpgpu_ptx_sim_memcpy_symbol( m_symbol, m_host_address_dst, m_cnt, m_offset, 0, gpu); m_stream->record_next_done(); + if (gpu->is_SST_mode()) SST_callback_memcpy_from_symbol_done(); break; case stream_kernel_launch: if (m_sim_mode) { // Functional Sim @@ -227,6 +237,8 @@ void stream_operation::print(FILE *fp) const { case stream_no_op: fprintf(fp, "no-op"); break; + default: + break; } } @@ -300,6 +312,14 @@ bool stream_manager::register_finished_kernel(unsigned grid_uid) { void stream_manager::stop_all_running_kernels() { pthread_mutex_lock(&m_lock); + std::vector<unsigned long long> finished_streams; + std::vector<kernel_info_t *> running_kernels = m_gpu->get_running_kernels(); + for (kernel_info_t *k : running_kernels) { + if (k != NULL) { + finished_streams.push_back(k->get_streamID()); + } + } + // Signal m_gpu to stop all running kernels m_gpu->stop_all_running_kernels(); @@ -310,7 +330,9 @@ } // If any kernels completed, print out the current stats - if (count > 0) m_gpu->print_stats(); + for (unsigned long long streamID : finished_streams) { + m_gpu->print_stats(streamID); + } pthread_mutex_unlock(&m_lock); } @@ -460,7 +482,7 @@ void stream_manager::push(stream_operation op) { } if (g_debug_execution >= 3) print_impl(stdout); pthread_mutex_unlock(&m_lock); - if (m_cuda_launch_blocking || stream == NULL) { + if (!m_gpu->is_SST_mode() && (m_cuda_launch_blocking || stream == NULL)) { unsigned int wait_amount = 100; unsigned int wait_cap = 100000; // 100ms while (!empty()) { diff --git a/src/stream_manager.h b/src/stream_manager.h index afcbb0e41..561f54b87 100644 --- a/src/stream_manager.h +++ b/src/stream_manager.h @@ -73,7 +73,7 @@ struct CUevent_st { int m_uid; bool m_blocking; bool m_done; - int m_updates; + unsigned int m_updates; unsigned int m_issued; time_t m_wallclock; double m_gpu_tot_sim_cycle; diff --git a/version b/version index 1a1a990cd..09e18b115 100644 --- a/version +++ b/version @@ -1 +1 @@ -const char *g_gpgpusim_version_string = "GPGPU-Sim Simulator Version 4.0.0 "; +const char *g_gpgpusim_version_string = "GPGPU-Sim Simulator Version 4.2.0 "; diff --git a/version.in b/version.in new file mode 100644 index 000000000..2935c6f56 --- /dev/null +++ b/version.in @@ -0,0 +1 @@ +const char
*g_gpgpusim_build_string="@GPGPUSIM_BUILD_STRING@"; \ No newline at end of file diff --git a/version_detection.mk b/version_detection.mk index ee71a6240..0bf309500 100644 --- a/version_detection.mk +++ b/version_detection.mk @@ -32,7 +32,7 @@ else GPGPUSIM_VERSION=$(shell cat $(GPGPUSIM_ROOT)/version | awk '/Version/ {print $$8}' ) #Detect Git branch and commit # -GIT_COMMIT := $(shell git log -n 1 | head -1 | sed -re 's/commit (.*)/\1/') +GIT_COMMIT := $(shell git log --abbrev-commit -n 1 | head -1 | sed -re 's/commit (.*)/\1/') GIT_FILES_CHANGED_A:=$(shell git diff --numstat | wc | sed -re 's/^\s+([0-9]+).*/\1./') GIT_FILES_CHANGED:= $(GIT_FILES_CHANGED_A)$(shell git diff --numstat --cached | wc | sed -re 's/^\s+([0-9]+).*/\1/') GPGPUSIM_BUILD := gpgpu-sim_git-commit-$(GIT_COMMIT)_modified_$(GIT_FILES_CHANGED) @@ -43,7 +43,7 @@ CUDA_VERSION_STRING:=$(shell $(CUDA_INSTALL_PATH)/bin/nvcc --version | awk '/rel CUDART_VERSION:=$(shell echo $(CUDA_VERSION_STRING) | sed 's/\./ /' | awk '{printf("%02u%02u", 10*int($$1), 10*$$2);}') # Detect GCC Version -CC_VERSION := $(shell gcc --version | head -1 | awk '{for(i=1;i<=NF;i++){ if(match($$i,/^[0-9]\.[0-9]\.[0-9]$$/)) {print $$i; exit 0 }}}') +CC_VERSION := $(shell gcc --version | head -1 | awk '{for(i=1;i<=NF;i++){ if(match($$i,/^[0-9]+\.[0-9]+\.[0-9]+$$/)) {print $$i; exit 0 }}}') # Detect Support for C++11 (C++0x) from GCC Version -GNUC_CPP0X := $(shell gcc --version | perl -ne 'if (/gcc\s+\(.*\)\s+([0-9.]+)/){ if($$1 >= 4.3) {$$n=1} else {$$n=0;} } END { print $$n; }') +GNUC_CPP0X := 1
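
One closing note on the version-detection change above: the original CC_VERSION pattern only accepted single-digit version components, so a modern compiler such as GCC 11.4.0 left CC_VERSION empty, and GNUC_CPP0X is now hard-coded to 1 since any detectable GCC supports C++11. The quick standalone check below demonstrates the regex fix (C++ purely for illustration; the build itself uses awk's match(), which applies the same ERE semantics):

#include <iostream>
#include <regex>

int main() {
  const std::regex old_pat("^[0-9]\\.[0-9]\\.[0-9]$");     // pre-patch pattern
  const std::regex new_pat("^[0-9]+\\.[0-9]+\\.[0-9]+$");  // patched pattern
  for (const char *ver : {"4.9.2", "11.4.0"}) {
    std::cout << ver << "  old: " << std::regex_match(ver, old_pat)
              << "  new: " << std::regex_match(ver, new_pat) << "\n";
  }
  return 0;
}
// Prints:  4.9.2   old: 1  new: 1
//          11.4.0  old: 0  new: 1
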