forked from gpgpu-sim/gpgpu-sim_distribution
-
Notifications
You must be signed in to change notification settings - Fork 65
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
667834c
commit 140228d
Showing
2 changed files
with
254 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
//21*1 fly with 32 flits per packet under gpgpusim injection mode | ||
use_map = 0; | ||
flit_size = 40; | ||
|
||
// currently we do not use this, see subnets below | ||
network_count = 2; | ||
|
||
// Topology | ||
topology = fly; | ||
k = 78; | ||
n = 1; | ||
|
||
// Routing | ||
|
||
routing_function = dest_tag; | ||
|
||
|
||
// Flow control | ||
|
||
num_vcs = 1; | ||
vc_buf_size = 256; | ||
input_buffer_size = 256; | ||
ejection_buffer_size = 256; | ||
boundary_buffer_size = 256; | ||
|
||
wait_for_tail_credit = 0; | ||
|
||
// Router architecture | ||
|
||
vc_allocator = islip; //separable_input_first; | ||
sw_allocator = islip; //separable_input_first; | ||
alloc_iters = 1; | ||
|
||
credit_delay = 0; | ||
routing_delay = 0; | ||
vc_alloc_delay = 1; | ||
sw_alloc_delay = 1; | ||
|
||
input_speedup = 1; | ||
output_speedup = 1; | ||
internal_speedup = 2.0; | ||
|
||
// Traffic, GPGPU-Sim does not use this | ||
|
||
traffic = uniform; | ||
packet_size ={{1,2,3,4},{10,20}}; | ||
packet_size_rate={{1,1,1,1},{2,1}}; | ||
|
||
// Simulation - Don't change | ||
|
||
sim_type = gpgpusim; | ||
//sim_type = latency; | ||
injection_rate = 0.1; | ||
|
||
subnets = 2; | ||
|
||
// Always use read and write no matter following line | ||
//use_read_write = 1; | ||
|
||
|
||
read_request_subnet = 0; | ||
read_reply_subnet = 1; | ||
write_request_subnet = 0; | ||
write_reply_subnet = 1; | ||
|
||
read_request_begin_vc = 0; | ||
read_request_end_vc = 0; | ||
write_request_begin_vc = 0; | ||
write_request_end_vc = 0; | ||
read_reply_begin_vc = 0; | ||
read_reply_end_vc = 0; | ||
write_reply_begin_vc = 0; | ||
write_reply_end_vc = 0; | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,180 @@ | ||
# functional simulator specification | ||
-gpgpu_ptx_instruction_classification 0 | ||
-gpgpu_ptx_sim_mode 0 | ||
-gpgpu_ptx_force_max_capability 86 | ||
|
||
# Device Limits | ||
-gpgpu_stack_size_limit 1024 | ||
-gpgpu_heap_size_limit 8388608 | ||
-gpgpu_runtime_sync_depth_limit 2 | ||
-gpgpu_runtime_pending_launch_count_limit 2048 | ||
-gpgpu_kernel_launch_latency 5000 | ||
-gpgpu_TB_launch_latency 0 | ||
-gpgpu_max_concurrent_kernel 128 | ||
|
||
# Compute Capability | ||
-gpgpu_compute_capability_major 8 | ||
-gpgpu_compute_capability_minor 6 | ||
|
||
# PTX execution-driven | ||
-gpgpu_ptx_convert_to_ptxplus 0 | ||
-gpgpu_ptx_save_converted_ptxplus 0 | ||
|
||
# high level architecture configuration | ||
-gpgpu_n_clusters 46 | ||
-gpgpu_n_cores_per_cluster 1 | ||
-gpgpu_n_mem 16 | ||
-gpgpu_n_sub_partition_per_mchannel 2 | ||
|
||
# clock domains | ||
#-gpgpu_clock_domains <Core Clock>:<Interconnect Clock>:<L2 Clock>:<DRAM Clock> | ||
-gpgpu_clock_domains 1132:1132:1132:3500.5 | ||
|
||
# shader core pipeline config | ||
-gpgpu_shader_registers 65536 | ||
-gpgpu_registers_per_block 65536 | ||
-gpgpu_occupancy_sm_number 86 | ||
|
||
-gpgpu_shader_core_pipeline 1536:32 | ||
-gpgpu_shader_cta 32 | ||
-gpgpu_simd_model 1 | ||
|
||
# Pipeline widths and number of FUs | ||
# ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE | ||
-gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4 | ||
-gpgpu_num_sp_units 4 | ||
-gpgpu_num_sfu_units 4 | ||
-gpgpu_num_dp_units 4 | ||
-gpgpu_num_int_units 4 | ||
-gpgpu_tensor_core_avail 1 | ||
-gpgpu_num_tensor_core_units 4 | ||
|
||
# Instruction latencies and initiation intervals | ||
# "ADD,MAX,MUL,MAD,DIV" | ||
# All Div operations are executed on SFU unit | ||
-ptx_opcode_latency_int 4,4,4,4,21 | ||
-ptx_opcode_initiation_int 2,2,2,2,2 | ||
-ptx_opcode_latency_fp 4,4,4,4,39 | ||
-ptx_opcode_initiation_fp 1,1,1,1,2 | ||
-ptx_opcode_latency_dp 64,64,64,64,330 | ||
-ptx_opcode_initiation_dp 64,64,64,64,130 | ||
-ptx_opcode_latency_sfu 21 | ||
-ptx_opcode_initiation_sfu 8 | ||
-ptx_opcode_latency_tesnor 64 | ||
-ptx_opcode_initiation_tensor 64 | ||
|
||
# sub core model: in which each scheduler has its own register file and EUs | ||
# i.e. schedulers are isolated | ||
-gpgpu_sub_core_model 1 | ||
# disable specialized operand collectors and use generic operand collectors instead | ||
-gpgpu_enable_specialized_operand_collector 0 | ||
-gpgpu_operand_collector_num_units_gen 8 | ||
-gpgpu_operand_collector_num_in_ports_gen 8 | ||
-gpgpu_operand_collector_num_out_ports_gen 8 | ||
# register banks | ||
-gpgpu_num_reg_banks 8 | ||
-gpgpu_reg_file_port_throughput 2 | ||
|
||
# warp scheduling | ||
-gpgpu_num_sched_per_core 4 | ||
-gpgpu_scheduler lrr | ||
# a warp scheduler issue mode | ||
-gpgpu_max_insn_issue_per_warp 1 | ||
-gpgpu_dual_issue_diff_exec_units 1 | ||
|
||
## L1/shared memory configuration | ||
# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry> | ||
# ** Optional parameter - Required when mshr_type==Texture Fifo | ||
# In adaptive cache, we adaptively assign the remaining shared memory to L1 cache | ||
# For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x | ||
-gpgpu_adaptive_cache_config 1 | ||
-gpgpu_shmem_option 0,8,16,32,64,100 | ||
-gpgpu_unified_l1d_size 128 | ||
# L1 cache configuration | ||
-gpgpu_l1_banks 4 | ||
-gpgpu_cache:dl1 S:4:128:256,L:T:m:L:L,A:384:48,16:0,32 | ||
-gpgpu_l1_latency 39 | ||
-gpgpu_gmem_skip_L1D 0 | ||
-gpgpu_flush_l1_cache 1 | ||
-gpgpu_n_cluster_ejection_buffer_size 32 | ||
-gpgpu_l1_cache_write_ratio 25 | ||
|
||
# shared memory configuration | ||
-gpgpu_shmem_size 102400 | ||
-gpgpu_shmem_sizeDefault 102400 | ||
-gpgpu_shmem_per_block 49152 | ||
-gpgpu_smem_latency 29 | ||
# shared memory bankconflict detection | ||
-gpgpu_shmem_num_banks 32 | ||
-gpgpu_shmem_limited_broadcast 0 | ||
-gpgpu_shmem_warp_parts 1 | ||
-gpgpu_coalesce_arch 86 | ||
|
||
# L2 cache | ||
-gpgpu_cache:dl2 S:64:128:16,L:B:m:L:P,A:192:4,32:0,32 | ||
-gpgpu_cache:dl2_texture_only 0 | ||
-gpgpu_dram_partition_queues 64:64:64:64 | ||
-gpgpu_perf_sim_memcpy 1 | ||
-gpgpu_memory_partition_indexing 2 | ||
|
||
# 128 KB Inst. | ||
-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4 | ||
-gpgpu_inst_fetch_throughput 4 | ||
# 128 KB Tex | ||
# Note, TEX is deprected since Volta, It is used for legacy apps only. Use L1D cache instead with .nc modifier or __ldg mehtod | ||
-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2 | ||
# 64 KB Const | ||
-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4 | ||
-gpgpu_perfect_inst_const_cache 1 | ||
|
||
# interconnection | ||
# use built-in local xbar | ||
-network_mode 2 | ||
-icnt_in_buffer_limit 512 | ||
-icnt_out_buffer_limit 512 | ||
-icnt_subnets 2 | ||
-icnt_flit_size 40 | ||
-icnt_arbiter_algo 1 | ||
|
||
# memory partition latency config | ||
-gpgpu_l2_rop_latency 187 | ||
-dram_latency 254 | ||
|
||
# dram sched config | ||
-gpgpu_dram_scheduler 1 | ||
-gpgpu_frfcfs_dram_sched_queue_size 64 | ||
-gpgpu_dram_return_queue_size 192 | ||
|
||
# dram model config | ||
-gpgpu_n_mem_per_ctrlr 1 | ||
-gpgpu_dram_buswidth 2 | ||
-gpgpu_dram_burst_length 16 | ||
-dram_data_command_freq_ratio 4 | ||
-gpgpu_mem_address_mask 1 | ||
-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCC.BCCSSSSS | ||
|
||
# Mem timing | ||
-gpgpu_dram_timing_opt nbk=16:CCD=4:RRD=12:RCD=24:RAS=55:RP=24:RC=78:CL=24:WL=8:CDLR=10:WR=24:nbkgrp=4:CCDL=6:RTPL=4 | ||
-dram_dual_bus_interface 0 | ||
|
||
# select lower bits for bnkgrp to increase bnkgrp parallelism | ||
-dram_bnk_indexing_policy 0 | ||
-dram_bnkgrp_indexing_policy 1 | ||
|
||
#-dram_seperate_write_queue_enable 1 | ||
#-dram_write_queue_size 64:56:32 | ||
|
||
# stat collection | ||
-gpgpu_memlatency_stat 14 | ||
-gpgpu_runtime_stat 500 | ||
-enable_ptx_file_line_stats 1 | ||
-visualizer_enabled 0 | ||
|
||
# power model configs, disable it untill we create a real energy model | ||
-power_simulation_enabled 0 | ||
|
||
# tracing functionality | ||
#-trace_enabled 1 | ||
#-trace_components WARP_SCHEDULER,SCOREBOARD | ||
#-trace_sampling_core 0 | ||
|