etmc · Jan 7, 2019 · Jan 7, 2019 · Jan 7, 2019 · Jan 8, 2019 · Jan 8, 2019
Showing with 225 additions and 194 deletions.

+16 −7 src/DDalphaAMG.h

+102 −36 src/DDalphaAMG_interface.c

+3 −3 src/main.c

+4 −8 src/setup_generic.c

+100 −140 tests/{DDalphaAMG_proj_test.c → DDalphaAMG_benchmark.c}
diff --git a/src/DDalphaAMG.h b/src/DDalphaAMG.h
@@ -243,26 +243,35 @@
    **   vector_out = P^\dagger * vector_in.
    **  mg_status.success = 1
    **/
-  void DDalphaAMG_restrict( double *vector_out, double *vector_in, int level,
+  void DDalphaAMG_restrict( float *vector_out, float *vector_in, int level,
                             DDalphaAMG_status *mg_status );
 
   /**
    ** Optional - Prolongate vector to level:
    **   vector_out = P * vector_in.
    **  mg_status.success = 1
    **/
-  void DDalphaAMG_prolongate( double *vector_out, double *vector_in, int level,
-                            DDalphaAMG_status *mg_status );
+  void DDalphaAMG_prolongate( float *vector_out, float *vector_in, int level,
+                              DDalphaAMG_status *mg_status );
 
   /**
    ** Optional - Apply the operator:
    **   vector_out = D * vector_in.
    **  mg_status.success = 1
    **/
-  void DDalphaAMG_apply_coarse_operator( double *vector_out, double *vector_in, int level,
-                                  DDalphaAMG_status *mg_status );
+  void DDalphaAMG_apply_coarse_operator( float *vector_out, float *vector_in, int level,
+                                         DDalphaAMG_status *mg_status );
 
-/*
+  /**
+   ** Extra - coarse vector utils
+   **/
+  float* DDalphaAMG_coarse_vector_alloc( int level );
+  void DDalphaAMG_coarse_vector_free( float *vector, int level );
+  void DDalphaAMG_coarse_vector_rand( float *vector, int level );
+  float DDalphaAMG_coarse_vector_residual( float *vector_out, float *vector_in, int level );
+
+
+  /*
    *  Concluding the following functions have to be call for freeing the memory and finalizing
    * the software. 
    */
@@ -376,7 +385,7 @@
      **    from 1 (no MG) to 4 (max number of levels)
      **/
     int number_of_levels;
-    
+
     /**
      **  Number of openmp threads:
      **    from 1 to omp_get_num_threads()

diff --git a/src/DDalphaAMG_interface.c b/src/DDalphaAMG_interface.c
@@ -1515,7 +1515,7 @@ static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_i
 
 }
 
-static inline void DDalphaAMG_proj_driver( double *vector_out, double *vector_in, int level, DDalphaAMG_status *mg_status, int _TYPE ) {
+static inline void DDalphaAMG_proj_driver( float *vector_out, float *vector_in, int level, DDalphaAMG_status *mg_status, int _TYPE ) {
 
   int t, z, y, x, i, j, mu, *ll;
 
@@ -1529,12 +1529,12 @@ static inline void DDalphaAMG_proj_driver( double *vector_out, double *vector_in
   } else if(_TYPE==_PROLONGATE) {
     from=ltmp->next_level;
     to=ltmp;    
-  } else {
-    from=ltmp->next_level;
+  } else { // includes _OPERATOR
+    from=ltmp;
     to=ltmp;    
   }
-  vector_float rhs = from->p_float.b;
-  vector_float sol = to->p_float.x;
+  vector_float in = from->p_float.b;
+  vector_float out = to->p_float.x;
 
   double t0, t1;
   t0 = MPI_Wtime();
@@ -1548,58 +1548,62 @@ static inline void DDalphaAMG_proj_driver( double *vector_out, double *vector_in
   ASSERT(vector_out!=NULL);
   ASSERT(vector_in!=NULL);
 
-  ll = from->local_lattice;
-  for (t=0, j=0; t<ll[T]; t++) 
-    for (z=0; z<ll[Z]; z++) 
-      for (y=0; y<ll[Y]; y++) 
-        for (x=0; x<ll[X]; x++) {
-          if(vector_index_fct!=NULL )
+  if(vector_index_fct!=NULL ) {
+    ll = from->local_lattice;
+    for (t=0, j=0; t<ll[T]; t++) 
+      for (z=0; z<ll[Z]; z++) 
+        for (y=0; y<ll[Y]; y++) 
+          for (x=0; x<ll[X]; x++) {
             i = vector_index_fct( t, z, y, x );
-          else 
-            i = 2*j;
 
           for ( mu=0; mu<from->num_lattice_site_var; mu++, j++ )
-            rhs[j] = ((complex_float)vector_in[i+2*mu] + I*(complex_float)vector_in[i+2*mu+1]);
-        }
+            in[j] = (vector_in[i+2*mu] + I*vector_in[i+2*mu+1]);
+          }
+  } else {
+    in = (complex_float*) vector_in;
+    out = (complex_float*) vector_out; 
+  }
 
   switch(_TYPE) {
 
   case _RESTRICT :
     THREADED(threading[0]->n_core)
-      restrict_float( sol, rhs, from, threading[omp_get_thread_num()] );
+      restrict_float( out, in, from, threading[omp_get_thread_num()] );
     break;
 
   case _PROLONGATE :
     THREADED(threading[0]->n_core)
-      interpolate3_float( sol, rhs, to, threading[omp_get_thread_num()] );
+      interpolate3_float( out, in, to, threading[omp_get_thread_num()] );
     break;
 
   case _OPERATOR :
-    THREADED(threading[0]->n_core)
-      apply_operator_float( sol, rhs, &(from->p_float), from, threading[omp_get_thread_num()] );
+    THREADED(threading[0]->n_core)          
+      if ( from->level==0 && g.odd_even )
+        coarse_odd_even_float_test( out, in, from, threading[omp_get_thread_num()] );
+      else
+        apply_operator_float( out, in, &(from->p_float), from, threading[omp_get_thread_num()] );
+
     break;
 
   default :
     warning0("_TYPE not found in DDalphaAMG_driver. Returing vector in as vector out.");
-    sol=rhs;
     break;
   }
 
-  ll = to->local_lattice;
-  for (t=0, j=0; t<ll[T]; t++) 
-    for (z=0; z<ll[Z]; z++) 
-      for (y=0; y<ll[Y]; y++) 
-        for (x=0; x<ll[X]; x++) {
-          if(vector_index_fct!=NULL )
+  if(vector_index_fct!=NULL ) {
+    ll = to->local_lattice;
+    for (t=0, j=0; t<ll[T]; t++) 
+      for (z=0; z<ll[Z]; z++) 
+        for (y=0; y<ll[Y]; y++) 
+          for (x=0; x<ll[X]; x++) {
             i = vector_index_fct( t, z, y, x );
-          else 
-            i = 2*j;
-
-          for ( mu=0; mu<to->num_lattice_site_var; mu++, j++ ) {
-            vector_out[i+2*mu]   = (double) creal(sol[j]);
-            vector_out[i+2*mu+1] = (double) cimag(sol[j]);
+
+            for ( mu=0; mu<to->num_lattice_site_var; mu++, j++ ) {
+              vector_out[i+2*mu]   = creal(out[j]);
+              vector_out[i+2*mu+1] = cimag(out[j]);
+            }
           }
-        }
+  }
 
 
   mg_status->success = 1;
@@ -1761,18 +1765,80 @@ void DDalphaAMG_preconditioner_doublet( double *vector1_out, double *vector1_in,
   set_n_flavours( 1 );
 }
 
-void DDalphaAMG_restrict( double *vector_out, double *vector_in, int level, DDalphaAMG_status *mg_status ) {
+void DDalphaAMG_restrict( float *vector_out, float *vector_in, int level, DDalphaAMG_status *mg_status ) {
   DDalphaAMG_proj_driver( vector_out, vector_in, level, mg_status, _RESTRICT );
 }
 
-void DDalphaAMG_prolongate( double *vector_out, double *vector_in, int level, DDalphaAMG_status *mg_status ) {
+void DDalphaAMG_prolongate( float *vector_out, float *vector_in, int level, DDalphaAMG_status *mg_status ) {
   DDalphaAMG_proj_driver( vector_out, vector_in, level, mg_status, _PROLONGATE );
 }
 
-void DDalphaAMG_apply_coarse_operator( double *vector_out, double *vector_in, int level, DDalphaAMG_status *mg_status ) {
+void DDalphaAMG_apply_coarse_operator( float *vector_out, float *vector_in, int level, DDalphaAMG_status *mg_status ) {
   DDalphaAMG_proj_driver( vector_out, vector_in, level, mg_status, _OPERATOR );
 }
 
+float* DDalphaAMG_coarse_vector_alloc( int level ) {
+  level_struct *ltmp=&l;
+  for(int i=0; i<level; i++)
+    ltmp=ltmp->next_level;
+  vector_float vector = NULL;
+  MALLOC(vector, complex_float, ltmp->vector_size );
+  return (float*) vector;
+}
+
+void DDalphaAMG_coarse_vector_free( float *vector, int level ) {
+  level_struct *ltmp=&l;
+  for(int i=0; i<level; i++)
+    ltmp=ltmp->next_level;
+  FREE(vector, float, 2*ltmp->vector_size );
+}
+
+void DDalphaAMG_coarse_vector_rand( float *vector, int level ) {
+  level_struct *ltmp=&l;
+  for(int i=0; i<level; i++)
+    ltmp=ltmp->next_level;
+
+  THREADED(threading[0]->n_core)
+  if(vector!=NULL){
+    int start, end;
+    compute_core_start_end( 0, ltmp->inner_vector_size, &start, &end, ltmp, threading[omp_get_thread_num()]);
+    vector_float_define_random( (vector_float) vector, start, end, ltmp );
+  }
+  else {
+    warning0("Vector NULL when calling DDalphaAMG_define_vector_const!");
+  }
+}
+
+float DDalphaAMG_coarse_vector_residual( float *vector_out, float *vector_in, int level ) {
+  level_struct *ltmp=&l;
+  for(int i=0; i<level; i++)
+    ltmp=ltmp->next_level;
+
+  float norm1 = 1., norm2 = 1.;
+  THREADED(threading[0]->n_core)
+  if(vector_in!=NULL){
+    norm1 = global_norm_float( (vector_float) vector_in, 0, ltmp->inner_vector_size, ltmp, threading[omp_get_thread_num()] );
+    if(vector_out!=NULL){
+      int start, end;
+      compute_core_start_end( 0, ltmp->inner_vector_size, &start, &end, ltmp, threading[omp_get_thread_num()]);
+      vector_float_minus( (vector_float) vector_out, (vector_float) vector_out, (vector_float) vector_in, start, end, ltmp );
+    }
+  }
+  else {
+    warning0("Vector NULL when calling DDalphaAMG_define_vector_const!");
+  }
+  if(vector_out!=NULL){
+    THREADED(threading[0]->n_core)
+    norm2 = global_norm_float( (vector_float) vector_out, 0, ltmp->inner_vector_size, ltmp, threading[omp_get_thread_num()] );
+  }
+  else {
+    warning0("Vector NULL when calling DDalphaAMG_define_vector_const!");
+    return norm1;
+  }
+
+  return norm2/norm1;
+}
+
 void DDalphaAMG_free( void ) {
   method_free( &l );
   g.setup_flag = 0;

diff --git a/src/main.c b/src/main.c
@@ -21,12 +21,12 @@
 
 #include "main.h"
 
-global_struct g;
+extern global_struct g;
 #ifdef HAVE_HDF5
 Hdf5_fileinfo h5info;
 #endif
-struct common_thread_data *commonthreaddata;
-struct Thread *no_threading;
+extern struct common_thread_data *commonthreaddata;
+extern struct Thread *no_threading;
 
 int main( int argc, char **argv ) {
 

diff --git a/src/setup_generic.c b/src/setup_generic.c
@@ -224,14 +224,10 @@ void interpolation_PRECISION_define( vector_double *V, level_struct *l, struct T
         vector_PRECISION_define_random( l->is_PRECISION.test_vector[k], 0, l->inner_vector_size, l );
         END_LOCKED_MASTER(threading)
 //       }
-
-      smoother_PRECISION( buffer[0], NULL, l->is_PRECISION.test_vector[k], 1, _NO_RES, l, threading );
-      vector_PRECISION_copy( l->is_PRECISION.test_vector[k], buffer[0], start, end, l );
-      smoother_PRECISION( buffer[0], NULL, l->is_PRECISION.test_vector[k], g.method>=4?1:2, _NO_RES, l, threading );
-      vector_PRECISION_copy( l->is_PRECISION.test_vector[k], buffer[0], start, end, l );
-      smoother_PRECISION( buffer[0], NULL, l->is_PRECISION.test_vector[k], g.method>=4?1:3, _NO_RES, l, threading );
-      vector_PRECISION_copy( l->is_PRECISION.test_vector[k], buffer[0], start, end, l );
-
+      for ( i = 0; i < g.post_smooth_iter[l->depth]; i++ ) {
+        smoother_PRECISION( buffer[0], NULL, l->is_PRECISION.test_vector[k], g.method>=4? 1:(i+1), _NO_RES, l, threading );
+        vector_PRECISION_copy( l->is_PRECISION.test_vector[k], buffer[0], start, end, l );
+      }
       pc += 6;
 #ifdef DEBUG
       START_MASTER(threading)

diff --git a/tests/DDalphaAMG_proj_test.c → tests/DDalphaAMG_benchmark.c b/tests/DDalphaAMG_proj_test.c → tests/DDalphaAMG_benchmark.c
@@ -26,44 +26,46 @@
 #include <getopt.h>
 #include <mpi.h>
 #include <omp.h>
+#include <math.h>
 
 //library to include
 #include "DDalphaAMG.h"
 
 enum { T, Z, Y, X };
 #define printf0(...) do{if(rank==0)printf(__VA_ARGS__);}while(0)
 
-int rank, max_setup=0, conf_format;
-float residual=1e-9;
+int rank;
+int iters=100;
+int coarsest_local_lattice[4];
 MPI_Comm comm_cart;
 DDalphaAMG_init init;
 DDalphaAMG_parameters params;
 DDalphaAMG_status status;
-char * conf_file = "../conf/8x8x8x8b6.0000id3n1";
-char * options = "c:f:i:L:p:B:l:k:w:u:t:hr:C:K:V:m:s:S:v";
+char * options = "L:p:B:l:i:wmet:h:V:v";
+
 /*
  * Setting standard values for DDalphaAMG_init
  */
 void standard_init() {
-  init.global_lattice[T] = 8;
-  init.global_lattice[Z] = 8;
-  init.global_lattice[Y] = 8;
-  init.global_lattice[X] = 8;
+  coarsest_local_lattice[T] = 1;
+  coarsest_local_lattice[Z] = 1;
+  coarsest_local_lattice[Y] = 1;
+  coarsest_local_lattice[X] = 1;
 
   init.kappa = 0.142857143;
-  init.mu = 0.1;
-  init.csw = 1;
+  init.mu = 0.;
+  init.csw = 0.;
 
   init.bc = 1;
 
   init.number_of_levels = 3;
 
-  init.procs[T] = 2;
+  init.procs[T] = 1;
   init.procs[Z] = 1;
   init.procs[Y] = 1;
   init.procs[X] = 1;
 
-  init.number_openmp_threads = 2;
+  init.number_openmp_threads = 1;
 
   init.block_lattice[T]=2;
   init.block_lattice[Z]=2;
@@ -83,24 +85,17 @@ void help( char * arg0 ) {
   static int printed = 0;
   if(!printed) {
     printf0("\n\n");
-    printf0("Usage: %s -c <conf> [<option(s)>]\n", arg0);
-    printf0("   -c PATH      Configuration to load\n");
-    printf0("   -f #         Configuration format (0 -> DDalphaAMG, 1 -> Lime)\n");
-    printf0("   -i PATH      Input file (optional)\n");
-    printf0("   -L T Y X Z   Lattice size in each direction\n");
-    printf0("   -p T Y X Z   Processors in each direction\n");
-    printf0("   -B T Y X Z   Block size in each direction on first level.\n");
-    printf0("   -k #         kappa for the configuration\n");
-    printf0("   -w #         c_sw for the configuration\n");
-    printf0("   -u #         mu for the configuration\n");
-    printf0("   -t #         Number of OpenMp threads\n");
-    printf0("   -r #         Relative residual\n");
-    printf0("   -K #         K-cycle tolerance\n");
-    printf0("   -C #         Tolerance on coarsest grid\n");
+    printf0("Usage: %s [<option(s)>]\n", arg0);
+    printf0("   -L T Z Y X   Local lattice size on the coarsest level in each direction\n");
+    printf0("   -p T Z Y X   Processors in each direction\n");
+    printf0("   -B T Z Y X   Block size in each direction on first level. A block size of (2 2 2 2) is used for the following levels.\n");
     printf0("   -l #         Number of levels, l (from 1 to 4)\n");
-    printf0("   -V 1 [2] [3] Basis vectors between each level (l-1)\n");
-    printf0("   -m 2 [3] [4] Factor for mu on coarse levels\n");
-    printf0("   -s 1 [2] [3] Setup iterations on each level (l-1)\n");
+    printf0("   -i #         Number of iterations\n");
+    printf0("   -w           use c_sw term (without the coarse self-coupling is diagonal)\n");
+    printf0("   -m           use twisted mass term (without the coarse self-coupling is hermitian)\n");
+    printf0("   -e           use ND twisted mass term (this double the size of the vectors)\n");
+    printf0("   -t #         Number of OpenMp threads\n");
+    printf0("   -V 1 [2] [3] Basis vectors between each level (l-1 numbers)\n");
     printf0("   -v           Verbose\n");
   }
   printf0("\n\n");
@@ -116,15 +111,6 @@ void read_init_arg(int argc, char *argv[] ) {
   optind = 0;
   while ((opt = getopt(argc, argv, options)) != -1) {
     switch (opt) {
-    case 'c':
-      conf_file = optarg;
-      break;
-    case 'f':
-      conf_format = atoi(optarg);
-      break;
-    case 'i':
-      init.init_file = optarg;
-      break;
     case 'L':
       optind--;
       mu=0;
@@ -135,7 +121,7 @@ void read_init_arg(int argc, char *argv[] ) {
 	  fail++;
 	  break;
 	}
-	init.global_lattice[mu] = atoi(argv[optind]);
+	coarsest_local_lattice[mu] = atoi(argv[optind]);
 	mu++;        
       }
       if(mu < 4) {
@@ -183,14 +169,14 @@ void read_init_arg(int argc, char *argv[] ) {
     case 'l':
       init.number_of_levels = atoi(optarg);
       break;
-    case 'k':
-      init.kappa = atof(optarg);
+    case 'i':
+      iters = atoi(optarg);
       break;
     case 'w':
-      init.csw = atof(optarg);
+      init.csw = 1.;
       break;
-    case 'u':
-      init.mu = atof(optarg);
+    case 'm':
+      init.mu = 1.;
       break;
     case 't':
       init.number_openmp_threads = atoi(optarg);
@@ -203,12 +189,6 @@ void read_init_arg(int argc, char *argv[] ) {
       break;
     }
   }
-
-  if(conf_file == NULL) {
-    printf0("Error: configuration file is missing (use -c PATH).\n");
-    p++;
-    fail++;
-  }
 
   if(p) {
     help(argv[0]);
@@ -224,15 +204,6 @@ void read_params_arg(int argc, char *argv[] ) {
   optind = 0;
   while ((opt = getopt(argc, argv, options)) != -1) {
     switch (opt) {
-    case 'r':
-      residual = atof(optarg);
-      break;
-    case 'K':
-      params.kcycle_tolerance = atof(optarg);
-      break;
-    case 'C':
-      params.coarse_tolerance = atof(optarg);
-      break;
     case 'V':
       optind--;
       mu=0;
@@ -247,50 +218,17 @@ void read_params_arg(int argc, char *argv[] ) {
 	mu++;        
       }
       break;
-    case 'm':
-      optind--;
-      params.mu_factor[0]=1;
-      mu=1;
-      for ( ; optind < argc && *argv[optind] != '-'; optind++){
-	if(mu > 3) {
-	  printf0("Error: too many arguments in -m.\n");
-	  p++;
-	  fail++;
-	  break;
-	}
-	params.mu_factor[mu] = atof(argv[optind]);
-	mu++;        
-      }
-      break;
-    case 's':
-      optind--;
-      mu=0;
-      for ( ; optind < argc && *argv[optind] != '-'; optind++){
-	if(mu > 2) {
-	  printf0("Error: too many arguments in -s.\n");
-	  p++;
-	  fail++;
-	  break;
-	}
-	params.setup_iterations[mu] = atoi(argv[optind]);
-	mu++;        
-      }
+    case 'e':
+      params.epsbar = 1;
       break;
     case 'v':
       params.print = 1;
       break;
-    case '?':
     default: 
       break;
     }
   }
 
-  if(conf_file == NULL) {
-    printf0("Error: configuration file is missing (use -c PATH).\n");
-    p++;
-    fail++;
-  }
-
   if(p) {
     help(argv[0]);
     MPI_Abort(MPI_COMM_WORLD,0);
@@ -300,24 +238,37 @@ void read_params_arg(int argc, char *argv[] ) {
 }
 
 int main( int argc, char *argv[] ) {
-
 
   MPI_Init( &argc, &argv );
   MPI_Comm_rank( MPI_COMM_WORLD, &rank );
 
   standard_init();
   read_init_arg(argc, argv);
 
+  for(int mu=0; mu<4; mu++) {
+    init.global_lattice[mu] = coarsest_local_lattice[mu]*init.procs[mu]*init.block_lattice[mu];
+    for(int l=0; l<init.number_of_levels-2; l++) {
+      init.global_lattice[mu] *= 2;
+    }
+  }
+  printf0("global size: %d %d %d %d\n",init.global_lattice[T],init.global_lattice[X],init.global_lattice[Y],init.global_lattice[Z]);
+
   printf0("Running initialization...\n");
   DDalphaAMG_initialize( &init, &params, &status );
   printf0("Initialized %d levels in %.2f sec\n", status.success, status.time);
 
   int nlvl = status.success;
   read_params_arg(argc, argv);
-
+  
   comm_cart =  DDalphaAMG_get_communicator();
   MPI_Comm_rank( comm_cart, &rank );
 
+  // we don't want to waste time with the setup
+  for(int l=0; l<nlvl; l++) {
+    params.setup_iterations[l]=0;
+    params.smoother_iterations=0;
+  }
+
   printf0("Running updating\n");
   DDalphaAMG_update_parameters( &params, &status );
   if (status.success)
@@ -330,83 +281,92 @@ int main( int argc, char *argv[] ) {
   double *gauge_field;
   int vol = init.global_lattice[T] * init.global_lattice[X] * init.global_lattice[Y] *
     init.global_lattice[Z] / init.procs[T] / init.procs[X] / init.procs[Y] / init.procs[Z];
-  gauge_field = (double *) malloc(18*4*vol*sizeof(double));
+  gauge_field = (double *) calloc(18*4*vol,sizeof(double));
 
-  printf0("Reading config.\n");
-  DDalphaAMG_read_configuration( gauge_field, conf_file, conf_format, &status );
-  printf0("Reading configuration time %.2f sec\n", status.time);
-  printf0("Desired plaquette %.13lf\n", status.info);
+  printf0("Using unitary config config.\n");
+  for(int i=0; i<4*vol; i++) {
+    gauge_field[i*18+0] = 1.;
+    gauge_field[i*18+8] = 1.;
+    gauge_field[i*18+16] = 1.;
+  }
 
   printf0("Setting config.\n");
   DDalphaAMG_set_configuration( gauge_field, &status );
   printf0("Setting configuration time %.2f sec\n", status.time);
   printf0("Computed plaquette %.13lf\n", status.info);
-
+  free(gauge_field);
 
   printf0("Running setup\n");
   DDalphaAMG_setup( &status );
   printf0("Run %d setup iterations in %.2f sec (%.1f %% on coarse grid)\n", status.success,
 	  status.time, 100.*(status.coarse_time/status.time));
   printf0("Total iterations on fine grid %d\n", status.iter_count);
   printf0("Total iterations on coarse grids %d\n", status.coarse_iter_count);
- 
+
   /*
    * Defining fine and coarse vector randomly.
    */
-  double *vector1[nlvl], *vector2[nlvl];
-  int vols[nlvl], vars[nlvl];
-  vols[0]=vol;
-  vars[0]=3*4*2;
-
-  for ( int i=1; i<nlvl; i++ ) {
-    vols[i] = vols[i-1] / params.block_lattice[i-1][T] / params.block_lattice[i-1][X] / params.block_lattice[i-1][Y] / params.block_lattice[i-1][Z];
-    vars[i]=params.mg_basis_vectors[i-1]*2*2; // a factor of 2 is for the spin, the other for the complex
-  }
+  float *vector1[nlvl], *vector2[nlvl];
 
   for ( int i=0; i<nlvl; i++ ) {
-    vector1[i] = (double *) malloc(vars[i]*vols[i]*sizeof(double));
-    vector2[i] = (double *) malloc(vars[i]*vols[i]*sizeof(double));
+    vector1[i] = DDalphaAMG_coarse_vector_alloc(i);
+    vector2[i] = DDalphaAMG_coarse_vector_alloc(i);
+    DDalphaAMG_coarse_vector_rand(vector1[i], i);
   }
 
-  for ( int i=0; i<nlvl; i++ )
-    for ( int j=0; j<vars[i]*vols[i]; j++ )
-      vector1[i][j] = ((double)rand()/(double)RAND_MAX)-0.5;
-
-
   for ( int i=1; i<nlvl; i++ ) {
-    printf0("Testing RP=1 on level %d\n",i);
+    printf0("\nTesting RP=1 on level %d\n",i);
     DDalphaAMG_prolongate(vector2[i-1], vector1[i], i-1, &status);
     DDalphaAMG_restrict(vector2[i], vector2[i-1], i-1, &status);
 
-    double num=0, den=0;
-    for ( int j=0; j<vars[i]*vols[i]; j++ ) {
-      vector2[i][j] -= vector1[i][j];
-      num += vector2[i][j]*vector2[i][j];
-      den += vector1[i][j]*vector1[i][j];
-    }
-    printf0("Restult (1-RP)v = %e\n\n", num/den);
+    float res = DDalphaAMG_coarse_vector_residual(vector2[i], vector1[i], i);
+    printf0("Result (1-RP)v = %e\n", res);
   }
 
   for ( int i=1; i<nlvl; i++ ) {
-    printf0("Testing coarse operator on level %d\n",i);
+    printf0("\nTesting coarse operator on level %d\n",i);
+    DDalphaAMG_apply_coarse_operator(vector2[i], vector1[i], i, &status);
+
     DDalphaAMG_prolongate(vector1[i-1], vector1[i], i-1, &status);
     DDalphaAMG_apply_coarse_operator(vector2[i-1], vector1[i-1], i-1, &status);
-    DDalphaAMG_restrict(vector2[i], vector2[i-1], i-1, &status);
+    DDalphaAMG_restrict(vector1[i], vector2[i-1], i-1, &status);
+
+    float res = DDalphaAMG_coarse_vector_residual(vector2[i], vector1[i], i);
+    printf0("Result (D_c-RDP)v = %e\n", res);
 
-    DDalphaAMG_apply_coarse_operator(vector1[i], vector1[i], i, &status);
+    // restoring vector1[i]
+    DDalphaAMG_restrict(vector1[i], vector1[i-1], i-1, &status);
+  }
 
-    double num=0, den=0;
-    for ( int j=0; j<vars[i]*vols[i]; j++ ) {
-      vector2[i][j] -= vector1[i][j];
-      num += vector2[i][j]*vector2[i][j];
-      den += vector1[i][j]*vector1[i][j];
+  printf0("\nStarting bechmarks:\n");
+  for ( int i=1; i<nlvl; i++ ) {
+    double time = 0;
+    for ( int j=0; j<iters; j++ ) {
+      DDalphaAMG_prolongate(vector1[i-1], vector1[i], i-1, &status);
+      time += status.time;
+    }
+    printf0("Level %d, prolongation from level %d to level %d: %e averaged over %d iters\n", i, i, i-1, time/iters, iters);
+    time = 0;
+    for ( int j=0; j<iters; j++ ) {
+      DDalphaAMG_restrict(vector1[i], vector1[i-1], i-1, &status);
+      time += status.time;
+    }
+    printf0("Level %d, restriction from level %d to level %d: %e averaged over %d iters\n", i, i-1, i, time/iters, iters);
+    time = 0;
+    for ( int j=0; j<iters; j++ ) {
+      DDalphaAMG_apply_coarse_operator(vector1[i], vector1[i], i, &status);
+      time += status.time;
     }
-    printf0("Restult (D_c-RDP)v = %e\n\n", num/den);
+    printf0("Level %d, coarse operator on level %d: %e averaged over %d iters\n", i, i, time/iters, iters);
+  }
+
+
+  for ( int i=0; i<nlvl; i++ ) {
+    DDalphaAMG_coarse_vector_free(vector1[i], i);
+    DDalphaAMG_coarse_vector_free(vector2[i], i);
   }
 
-  //  free(vector_in);
-  // free(vector_out);
-  //free(gauge_field);
-  //  DDalphaAMG_finalize();
+  DDalphaAMG_finalize();
   MPI_Finalize();
+  return 0;
 }