Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Running OpenMP benchmark is causing stalls #244

Open
avacoder42 opened this issue Sep 10, 2020 · 0 comments
Open

Running OpenMP benchmark is causing stalls #244

avacoder42 opened this issue Sep 10, 2020 · 0 comments

Comments

@avacoder42
Copy link

avacoder42 commented Sep 10, 2020

Hi,
I'm running zsim (in ramulator) with PIM hooks and ROI hooks on Polybench OpenMP applications such as triangular matrix multiplication (trmm) and Cholesky decomposition.
For all dataset sizes, from small to large, I get the following warning messages, and the run time is more than an hour. Do these warnings need to be addressed and corrected? Thank you.

[S 0] WARN: Futex wake matching failed (0/31) (external/ff waiters?
[H] WARN: Stalled for 20 secs so far
[H] WARN: Stalled for 30 secs so far
[H] WARN: Stalled for 20 secs so far
[H] WARN: Stalled for 30 secs so far
[H] WARN: Stalled for 20 secs so far
[H] WARN: Stalled for 20 secs so far
[H] WARN: Stalled for 20 secs so far
[H] WARN: Stalled for 30 secs so far
[H] WARN: Stalled for 20 secs so far
[H] WARN: Stalled for 20 secs so far
[H] WARN: Stalled for 30 secs so far
[H] WARN: Stalled for 20 secs so far
[H] WARN: Stalled for 30 secs so far
[H] WARN: Stalled for 40 secs so far

[H] WARN: Stalled for 20 secs so far
[H] WARN: Stalled for 30 secs so far
[H] WARN: Stalled for 20 secs so far
[S 0] Detected possible stall due to fake leaves (1 current)
[S 0]  [0/0] futex (202) @ 0x7fffe2e08111
[S 0] Blacklisting from future fake leaves: [0] futex @ 0x7fffe2e08111 | arg0 0x555555757444 arg1 0x80

Configuration is as follows -

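// 32 OOO cores at 2.4GHz, each with its own L1I/L1D/L2, sharing one 32-bank 64MB L3 (the original comment described a 6-core Westmere with 10 Niagara-like cores attached to the L3, which does not match the settings below)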
sys = {
    lineSize = 64;
    frequency = 2400;

    cores = {
        core = {
            type = "OOO";
            cores = 32;
            icache = "l1i";
            dcache = "l1d";
        };
    };

  
    caches = {
        l1d = {
            array = {
                type = "SetAssoc";
                ways = 8;
            };
            caches = 32;
            latency = 4;
            size = 32768;
        };
        l1i = {
            array = {
                type = "SetAssoc";
                ways = 4;
            };
            caches = 32;
            latency = 3;
            size = 32768;
        };
        l2 = {
            array = {
                type = "SetAssoc";
                ways = 8;
            };
	    //type = "Timing";
	    //mshrs = 10;
            caches = 32;
            latency = 7;
            children = "l1i|l1d";
            size = 262144;
        };
        l3 = {
            array = {
                hash = "H3";
                type = "SetAssoc";
                ways = 16;
            };
	    //type = "Timing";
	    //mshrs = 16;
            banks = 32;
            caches = 1;
            latency = 27;
            children = "l2";
	    size = 67108864;
        };


    };
    
    mem = {
        type = "Traces";
        instr_traces = true;
	      only_offload = true;
	      pim_traces = true;
 
        outFile = "pim-poly_cholesky_32.out"
    };

};

sim = {
    phaseLength = 10000;
    maxTotalInstrs = 10000000000L;
    statsPhaseInterval = 1000;
    printHierarchy = true;
    // attachDebugger = True;
};

process0 = {
    command = "benchmarks/PolyBench-ACC-master/OpenMP/linear-algebra/kernels/cholesky/cholesky" ;
    startFastForwarded = True;
//    command = "ls -la";
//    command = "unzip tracesLois.out.gz";
};

And code with hooks -

/*
 * PolyBench Cholesky kernel, parallelised over rows with OpenMP and with
 * each row's work bracketed by the zsim PIM hooks.
 *
 * For every row i the kernel:
 *   1. accumulates x = A[i][i] - sum_{j<i} A[i][j]^2 and stores the
 *      reciprocal square root in p[i];
 *   2. for every j > i, accumulates x = A[i][j] - sum_{k<i} A[j][k]*A[i][k]
 *      and stores x * p[i] into A[j][i].
 *
 * Parameters:
 *   n - problem size passed by the caller.
 *   p - output vector; p[i] receives 1/sqrt of the reduced diagonal entry.
 *   A - input matrix, updated in place below the diagonal.
 */
static
void kernel_cholesky(int n,
		     DATA_TYPE POLYBENCH_1D(p,N,n),
		     DATA_TYPE POLYBENCH_2D(A,N,N,n,n))
{
  int i, j, k;
  DATA_TYPE x;

  #pragma scop
  #pragma omp parallel
  {
    /*
     * x is the per-row accumulator and must be private: it is declared
     * outside the parallel region, so without the clause every thread
     * would read and write the same variable (a data race). It is always
     * assigned before it is read in an iteration, so an uninitialised
     * private copy is fine.
     *
     * NOTE(review): iteration i reads A[i][j] and A[i][k] for j,k < i,
     * which are the elements earlier iterations write as A[j][i]. That is
     * a dependence carried by the i loop, so distributing i across threads
     * may read values that are not final yet -- confirm whether this
     * parallelisation is intended for the benchmark.
     */
    #pragma omp for private (j,k,x)
    for (i = 0; i < _PB_N; ++i)
      {
        zsim_PIM_function_begin();

        /* Reduce the diagonal element against the row to its left. */
        x = A[i][i];
        for (j = 0; j <= i - 1; ++j)
          {
            x = x - A[i][j] * A[i][j];
          }
        p[i] = 1.0 / sqrt(x);

        /* Fill column i below the diagonal. */
        for (j = i + 1; j < _PB_N; ++j)
          {
            x = A[i][j];
            for (k = 0; k <= i - 1; ++k)
              {
                x = x - A[j][k] * A[i][k];
              }
            A[j][i] = x * p[i];
          }

        zsim_PIM_function_end();
      }
  }
  #pragma endscop
}


/*
 * Benchmark driver: allocates and initialises the inputs, runs
 * kernel_cholesky once between the zsim ROI hooks, prints the timing
 * instrumentation and the result, then frees the arrays.
 *
 * argc/argv are not referenced directly in this body.
 * Always returns 0.
 */
int main(int argc, char** argv)
{
  /* Retrieve problem size. */
  int n = N;

  /* Variable declaration/allocation. */
  POLYBENCH_2D_ARRAY_DECL(A, DATA_TYPE, N, N, n, n);
  POLYBENCH_1D_ARRAY_DECL(p, DATA_TYPE, N, n);


  /* Initialize array(s). init_array is defined outside this excerpt. */
  init_array (n, POLYBENCH_ARRAY(p), POLYBENCH_ARRAY(A));

  /* Start timer. */
  polybench_start_instruments;

  /* Run kernel. Only the kernel call sits between the zsim ROI hooks;
     allocation, initialisation and printing are outside them. */
  zsim_roi_begin();
  kernel_cholesky (n, POLYBENCH_ARRAY(p), POLYBENCH_ARRAY(A));
  zsim_roi_end();
  /* Stop and print timer. */
  polybench_stop_instruments;
  polybench_print_instruments;

  /* Prevent dead-code elimination. All live-out data must be printed
     by the function call in argument. print_array is defined outside
     this excerpt. */
  polybench_prevent_dce(print_array(n, POLYBENCH_ARRAY(A)));

  /* Be clean. */
  POLYBENCH_FREE_ARRAY(A);
  POLYBENCH_FREE_ARRAY(p);

  return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant