From 57093724d91de52ea8774ce1ecadd8a4221e71f6 Mon Sep 17 00:00:00 2001
From: Thilina Ratnayaka <thilinarmtb@users.noreply.github.com>
Date: Tue, 7 Nov 2023 02:13:45 -0600
Subject: [PATCH] Parrsb fixes and cleanup v4 (#9) (#72)

---
 .github/workflows/ci.yml     |   17 +-
 .github/workflows/coarse.yml |   55 --
 .github/workflows/ilu.yml    |   74 --
 .github/workflows/ilu0.m     |   11 -
 .github/workflows/iluc.m     |   10 -
 Makefile                     |   10 +-
 examples/coarse.c            |  153 ----
 examples/genmap.c            |    2 +-
 examples/ilu.c               |   36 -
 src/coarse-impl.h            |   31 -
 src/coarse-laplacian.c       |  293 -------
 src/coarse.c                 |  394 ---------
 src/coarse.h                 |   28 -
 src/components.c             |   60 +-
 src/con-check.c              |   40 +-
 src/con-impl.h               |    7 +-
 src/con-periodic.c           |   29 +-
 src/con-unique-vertices.c    |  153 +++-
 src/con.c                    |  159 ++--
 src/fiedler.c                |   63 +-
 src/helpers.c                |  104 +--
 src/ilu.c                    | 1513 ----------------------------------
 src/ilu.h                    |   24 -
 src/io.c                     |  100 +--
 src/laplacian.c              |   15 +-
 src/mat.c                    |  172 ++--
 src/mat.h                    |    3 +-
 src/metrics.c                |   22 +-
 src/metrics.h                |   14 +-
 src/multigrid.c              |  226 ++---
 src/multigrid.h              |    2 +-
 src/parRSB.h                 |   44 +-
 src/parrsb-impl.h            |   51 +-
 src/parrsb.c                 |  972 ++++++++++++++++++++++
 src/rcb.c                    |   21 +-
 src/rib.c                    |   19 +-
 src/rsb-aux.c                |  396 ---------
 src/rsb.c                    |  572 ++++++++-----
 src/schur.c                  | 1264 ----------------------------
 src/sort-bin.c               |   52 ++
 src/sort-hypercube.c         |  150 ++++
 src/sort-impl.h              |   34 +
 src/sort.c                   |  381 +++------
 src/sort.h                   |   56 +-
 src/statistics.c             |  230 ++++++
 45 files changed, 2566 insertions(+), 5496 deletions(-)
 delete mode 100644 .github/workflows/coarse.yml
 delete mode 100644 .github/workflows/ilu.yml
 delete mode 100644 .github/workflows/ilu0.m
 delete mode 100644 .github/workflows/iluc.m
 delete mode 100644 examples/coarse.c
 delete mode 100644 examples/ilu.c
 delete mode 100644 src/coarse-impl.h
 delete mode 100644 src/coarse-laplacian.c
 delete mode 100644 src/coarse.c
 delete mode 100644 src/coarse.h
 delete mode 100644 src/ilu.c
 delete mode 100644 src/ilu.h
 create mode 100644 src/parrsb.c
 delete mode 100644 src/rsb-aux.c
 delete mode 100644 src/schur.c
 create mode 100644 src/sort-bin.c
 create mode 100644 src/sort-hypercube.c
 create mode 100644 src/sort-impl.h
 create mode 100644 src/statistics.c

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index b66b2c2a..b14383b4 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -57,6 +57,7 @@ jobs:
         shell: bash
         run: |
           export PARRSB_RSB_ALGO=0
+          export PARRSB_VERBOSE_LEVEL=2
 
           cp ${EXAMPLESDIR}/genmap ${CIDIR}/${{ matrix.test }}
           cd ${CIDIR}/${{ matrix.test }}
@@ -70,6 +71,7 @@ jobs:
         run: |
           export PARRSB_RSB_ALGO=1
           export PARRSB_RSB_MG_FACTOR=2
+          export PARRSB_VERBOSE_LEVEL=2
 
           cp ${EXAMPLESDIR}/genmap ${CIDIR}/${{ matrix.test }}
           cd ${CIDIR}/${{ matrix.test }}
@@ -83,20 +85,7 @@ jobs:
         run: |
           export PARRSB_RSB_ALGO=1
           export PARRSB_RSB_MG_FACTOR=4
-
-          cp ${EXAMPLESDIR}/genmap ${CIDIR}/${{ matrix.test }}
-          cd ${CIDIR}/${{ matrix.test }}
-
-          tol=(`cat test.txt | grep tol`); tol=${tol[2]}
-          ${MPIEXE} -np ${{ matrix.np }} ./genmap --mesh ${{ matrix.test }} \
-            --tol=${tol} --dump=0 --test=1
-      - name: genmap-mg-factor-4-smooth
-        if: always()
-        shell: bash
-        run: |
-          export PARRSB_RSB_ALGO=1
-          export PARRSB_RSB_MG_FACTOR=4
-          export PARRSB_RSB_MG_SMOOTH_AGGREGATION=1
+          export PARRSB_VERBOSE_LEVEL=2
 
           cp ${EXAMPLESDIR}/genmap ${CIDIR}/${{ matrix.test }}
           cd ${CIDIR}/${{ matrix.test }}
diff --git a/.github/workflows/coarse.yml b/.github/workflows/coarse.yml
deleted file mode 100644
index 3d19b685..00000000
--- a/.github/workflows/coarse.yml
+++ /dev/null
@@ -1,55 +0,0 @@
-name: Coarse tests
-on:
-  push:
-    branch: [main]
-  pull_request:
-    branch: [main]
-env:
-  GITHUB.TOKEN: ${{ secrets.token }}
-  CIDIR: parRSB-github-ci
-  EXAMPLESDIR: build/examples
-  MPIEXE: "mpirun --oversubscribe"
-jobs:
-  coarse:
-    runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        test: [box_2x2x2, box_10x1x1, box_3x5x7, pyramid, solid, ethier, vortex, expansion]
-        np: [2, 3, 4]
-      fail-fast: false
-    name: "Coarse: ${{ matrix.test }}, NP = ${{ matrix.np }}"
-    env:
-      GSVER: 1.0.7
-      CC: mpicc
-    steps:
-      - uses: actions/checkout@v3
-      - name: Install apt dependencies
-        shell: bash
-        run: |
-          sudo apt -y update
-          sudo apt install -y openmpi-bin libopenmpi-dev
-          sudo apt install -y libblas-dev liblapack-dev
-          sudo apt install -y build-essential
-      - name: Build parRSB and clone tests
-        shell: bash
-        run: |
-          # Build gslib
-          git clone https://github.com/Nek5000/gslib.git
-          make -C gslib -j4
-
-          # Build parRSB
-          export GSLIBPATH=`pwd`/gslib/build/
-          make -j4 examples
-
-          # Clone tests
-          git clone https://github.com/thilinarmtb/${CIDIR}.git
-      - name: schur
-        if: always()
-        shell: bash
-        run: |
-          cp ${EXAMPLESDIR}/coarse ${CIDIR}/${{ matrix.test }}
-          cd ${CIDIR}/${{ matrix.test }}
-
-          tol=(`cat test.txt | grep tol`); tol=${tol[2]}
-          ${MPIEXE} -np ${{ matrix.np }} ./coarse --mesh ${{ matrix.test }} \
-            --tol=${tol} --crs_tol=1e-12
diff --git a/.github/workflows/ilu.yml b/.github/workflows/ilu.yml
deleted file mode 100644
index 763d1c18..00000000
--- a/.github/workflows/ilu.yml
+++ /dev/null
@@ -1,74 +0,0 @@
-name: ILU tests
-on:
-  push:
-    branch: [main]
-  pull_request:
-    branch: [main]
-env:
-  GITHUB.TOKEN: ${{ secrets.token }}
-  CIDIR: parRSB-github-ci
-  EXAMPLESDIR: build/examples
-  MPIEXE: "mpirun --oversubscribe"
-jobs:
-  ilu:
-    runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        test: [box_2x2x2, box_10x1x1, box_3x5x7, pyramid, solid, ethier, vortex, expansion]
-        np: [1, 2, 3, 4]
-      fail-fast: false
-    name: "ILU: ${{ matrix.test }}, NP = ${{ matrix.np }}"
-    env:
-      GSVER: 1.0.7
-      CC: mpicc
-    steps:
-      - uses: actions/checkout@v3
-      - name: Install apt dependencies
-        shell: bash
-        run: |
-          sudo apt -y update
-          sudo apt install -y openmpi-bin libopenmpi-dev
-          sudo apt install -y libblas-dev liblapack-dev
-          sudo apt install -y build-essential
-          sudo apt install -y octave
-      - name: Build parRSB and clone tests
-        shell: bash
-        run: |
-          # Build gslib
-          git clone https://github.com/Nek5000/gslib.git
-          make -C gslib -j4
-
-          # Build parRSB
-          export GSLIBPATH=`pwd`/gslib/build/
-          make -j4 examples
-
-          # Clone tests
-          git clone https://github.com/thilinarmtb/${CIDIR}.git
-      - name: ilu0
-        if: always()
-        shell: bash
-        run: |
-          export PARRSB_DUMP_ILU=1
-
-          cp ${EXAMPLESDIR}/ilu .github/workflows/ilu0.m ${CIDIR}/${{ matrix.test }}
-          cd ${CIDIR}/${{ matrix.test }}
-
-          tol=(`cat test.txt | grep tol`); tol=${tol[2]}
-          ${MPIEXE} -np ${{ matrix.np }} ./ilu --mesh ${{ matrix.test }} \
-            --tol=${tol} --ilu_type=0
-
-          octave-cli ilu0.m
-      - name: iluc
-        if: always()
-        shell: bash
-        run: |
-          export PARRSB_DUMP_ILU=1
-
-          cp ${EXAMPLESDIR}/ilu .github/workflows/iluc.m ${CIDIR}/${{ matrix.test }}
-          cd ${CIDIR}/${{ matrix.test }}
-
-          tol=(`cat test.txt | grep tol`); tol=${tol[2]}
-          ${MPIEXE} -np ${{ matrix.np }} ./ilu --mesh ${{ matrix.test }} \
-            --tol=${tol} --ilu_type=1
-
-          octave-cli iluc.m
diff --git a/.github/workflows/ilu0.m b/.github/workflows/ilu0.m
deleted file mode 100644
index 92e959bc..00000000
--- a/.github/workflows/ilu0.m
+++ /dev/null
@@ -1,11 +0,0 @@
-load 'A.txt';
-load 'B.txt';
-A = spconvert(A);
-B = spconvert(B);
-
-[L, U] = ilu(A);
-n = size(A, 1);
-I = speye(n);
-err = norm(L + U - B - I, Inf);
-printf('ILU err = %f', err);
-assert(err < 1e-8);
diff --git a/.github/workflows/iluc.m b/.github/workflows/iluc.m
deleted file mode 100644
index cba26949..00000000
--- a/.github/workflows/iluc.m
+++ /dev/null
@@ -1,10 +0,0 @@
-load 'A.txt'
-load 'LL.txt'
-load 'UU.txt'
-
-A = spconvert(A);
-LL = spconvert(LL);
-UU = spconvert(UU);
-err = norm(A - LL * UU, Inf);
-printf('LU error: %f\n', err);
-assert(err < 1e-8);
diff --git a/Makefile b/Makefile
index 555766cd..60a845f8 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,5 @@
 CC ?= mpicc
-CFLAGS ?=
+CFLAGS ?= -Wall -Wextra -Wpedantic -Wno-unused-function -Wno-unused-parameter -std=c99
 LDFLAGS ?=
 DEBUG ?= 0
 MPI ?= 1
@@ -8,6 +8,7 @@ SYNC_BY_REDUCTION ?= 1
 BLAS ?= 0
 BLASDIR ?=
 BLASFLAGS ?= -lblas -llapack
+GSLIBPATH ?=
 
 ########################## Don't touch what follows ###########################
 ifeq ($(GSLIBPATH),)
@@ -19,9 +20,10 @@ SRCROOT := $(realpath $(patsubst %/,%,$(dir $(MKFILEPATH))))
 SRCDIR = $(SRCROOT)/src
 EXAMPLEDIR = $(SRCROOT)/examples
 BUILDROOT = $(SRCROOT)/build
-INSTALLROOT = $(BUILDROOT)/install
-ifneq ($(strip $(DESTDIR)),)
-  INSTALLROOT = $(realpath $(DESTDIR))
+ifneq (,$(strip $(DESTDIR)))
+INSTALLROOT = $(DESTDIR)
+else
+INSTALLROOT = $(SRCROOT)/install
 endif
 
 SRCS = $(wildcard $(SRCDIR)/*.c)
diff --git a/examples/coarse.c b/examples/coarse.c
deleted file mode 100644
index 67909113..00000000
--- a/examples/coarse.c
+++ /dev/null
@@ -1,153 +0,0 @@
-//=============================================================================
-// Test Schur complement solver
-//
-#include "coarse.h"
-#include "parRSB.h"
-
-#include <math.h>
-#include <time.h>
-
-static double check_err(double *b, double *x, uint nelt, uint nv,
-                        const slong *vtx, MPI_Comm comm) {
-  struct comm c;
-  comm_init(&c, comm);
-
-  slong out[2][1], buf[2][1], in = nelt;
-  comm_scan(out, &c, gs_long, gs_add, &in, 1, buf);
-  ulong start = out[0][0] + 1;
-
-  ulong *eid = tcalloc(ulong, nelt);
-  for (uint i = 0; i < nelt; i++)
-    eid[i] = start + i;
-
-  struct crystal cr;
-  crystal_init(&cr, &c);
-
-  buffer bfr;
-  buffer_init(&bfr, 1024);
-
-  struct array nbrs, eij;
-  find_nbrs(&nbrs, eid, vtx, nelt, nv, &cr, &bfr);
-  compress_nbrs(&eij, &nbrs, &bfr);
-
-  struct par_mat M;
-  par_csr_setup(&M, &eij, 1, &bfr);
-  assert(M.rn > 0);
-
-  free(eid), array_free(&nbrs), array_free(&eij);
-
-  struct gs_data *gsh = setup_Q(&M, &c, &bfr);
-  double *bl = tcalloc(double, nelt);
-  double *wrk = tcalloc(double, M.rn + M.adj_off[M.rn]);
-  mat_vec_csr(bl, x, &M, gsh, wrk, &bfr);
-
-  crystal_free(&cr), comm_free(&c);
-  gs_free(gsh), par_mat_free(&M);
-
-  double norm = 0.0;
-  for (uint i = 0; i < nelt; i++)
-    norm += (bl[i] - b[i]) * (bl[i] - b[i]);
-  MPI_Allreduce(MPI_IN_PLACE, &norm, 1, MPI_DOUBLE, MPI_SUM, comm);
-
-  free(wrk), free(bl);
-  buffer_free(&bfr);
-
-  return sqrt(norm);
-}
-
-static void setup_rhs(double *b, const unsigned int nelt, MPI_Comm comm) {
-  srand(time(NULL));
-  double sum = 0;
-  for (int i = 0; i < nelt; i++) {
-    b[i] = (rand() % 50 + 1.0) / 10;
-    sum += b[i];
-  }
-  MPI_Allreduce(MPI_IN_PLACE, &sum, 1, MPI_DOUBLE, MPI_SUM, comm);
-
-  long long ng = nelt;
-  MPI_Allreduce(MPI_IN_PLACE, &ng, 1, MPI_LONG_LONG, MPI_SUM, comm);
-  sum /= ng;
-
-  double norm = 0;
-  for (int i = 0; i < nelt; i++) {
-    b[i] -= sum;
-    norm += b[i] * b[i];
-  }
-
-  MPI_Allreduce(MPI_IN_PLACE, &norm, 1, MPI_DOUBLE, MPI_SUM, comm);
-  norm = sqrt(norm);
-
-  for (int i = 0; i < nelt; i++)
-    b[i] /= norm;
-}
-
-static void setup_and_solve(unsigned nelt, unsigned nv, const long long *vl,
-                            const scalar *centroids,
-                            const parrsb_cmd_line_opts *in, MPI_Comm comm) {
-  // Setup the coarse solve with schur complement solver
-  struct comm c;
-  comm_init(&c, comm);
-
-  comm_barrier(&c);
-  double t = comm_time();
-  struct coarse *crs =
-      coarse_setup(nelt, nv, vl, centroids, 1, in->crs_type, &c);
-  double tsetup = comm_time() - t;
-
-  scalar *b = tcalloc(scalar, 2 * nelt);
-  setup_rhs(b, nelt, comm);
-
-  comm_barrier(&c);
-  t = comm_time();
-  scalar *x = b + nelt;
-  coarse_solve(x, crs, b, in->crs_tol);
-  double tsolve = MPI_Wtime() - t;
-
-  double enorm = check_err(b, x, nelt, nv, vl, comm);
-  if (c.id == 0) {
-    printf("MPI Ranks = %d\ncoarse_setup: %lf\ncoarse_solve = %lf\nerr = %lf\n",
-           c.np, tsetup, tsolve, enorm);
-    fflush(stdout);
-  }
-  int err = (enorm > 10 * in->crs_tol);
-  parrsb_check_error(err, comm);
-
-  // Free resources
-  coarse_free(crs), free(b);
-  comm_free(&c);
-}
-
-int main(int argc, char *argv[]) {
-  MPI_Init(&argc, &argv);
-  MPI_Comm comm = MPI_COMM_WORLD;
-
-  parrsb_cmd_line_opts *in = parrsb_parse_cmd_opts(argc, argv);
-  parrsb_check_error(in == NULL, comm);
-
-  // Read the geometry from the .re2 file, find connectiviy, partition and then
-  // distribute the mesh.
-  unsigned nelt, nv;
-  long long *vl = NULL;
-  double *coord = NULL;
-  int err = parrsb_setup_mesh(&nelt, &nv, &vl, &coord, in, comm);
-  parrsb_check_error(err, comm);
-
-  int ndim = (nv == 8) ? 3 : 2;
-  double *centroids = tcalloc(double, nelt *ndim);
-  for (uint i = 0; i < nelt; i++) {
-    for (int j = 0; j < nv; j++) {
-      for (int d = 0; d < ndim; d++)
-        centroids[i * ndim + d] += coord[i * ndim * nv + j * ndim + d];
-    }
-    for (int d = 0; d < ndim; d++)
-      centroids[i * ndim + d] /= nv;
-  }
-
-  setup_and_solve(nelt, nv, vl, centroids, in, comm);
-
-  free(vl), free(coord), free(centroids);
-  parrsb_cmd_opts_free(in);
-  MPI_Finalize();
-
-  return 0;
-}
diff --git a/examples/genmap.c b/examples/genmap.c
index a5307487..90941149 100644
--- a/examples/genmap.c
+++ b/examples/genmap.c
@@ -51,7 +51,7 @@ int main(int argc, char *argv[]) {
     parrsb_check_error(part == NULL, comm);
 
     parrsb_options options = parrsb_default_options;
-    err = parrsb_part_mesh(part, NULL, vl, coord, nelt, nv, options, comm);
+    err = parrsb_part_mesh(part, vl, coord, NULL, nelt, nv, &options, comm);
     parrsb_check_error(err, comm);
 
     // Redistribute data based on identified partitions
diff --git a/examples/ilu.c b/examples/ilu.c
deleted file mode 100644
index 0ade8e51..00000000
--- a/examples/ilu.c
+++ /dev/null
@@ -1,36 +0,0 @@
-//=============================================================================
-// Test ILU factorization
-//
-#include "ilu.h"
-#include "parRSB.h"
-
-int main(int argc, char *argv[]) {
-  MPI_Init(&argc, &argv);
-  MPI_Comm comm = MPI_COMM_WORLD;
-
-  parrsb_cmd_line_opts *in = parrsb_parse_cmd_opts(argc, argv);
-  parrsb_check_error(in == NULL, comm);
-
-  // Read the geometry from the .re2 file, find connectiviy, partition and then
-  // distribute the mesh.
-  unsigned int nelt, nv;
-  long long *vl = NULL;
-  double *coord = NULL;
-  parrsb_setup_mesh(&nelt, &nv, &vl, &coord, in, comm);
-
-  // Setup ILU
-  ilu_options iluopt = {.type = in->ilu_type,
-                        .tol = in->ilu_tol,
-                        .pivot = in->ilu_pivot,
-                        .verbose = in->verbose,
-                        .nnz_per_row = 0};
-  struct ilu *ilu = ilu_setup(nelt, nv, vl, &iluopt, comm);
-  ilu_free(ilu);
-
-  // Free resources
-  free(vl), free(coord);
-  parrsb_cmd_opts_free(in);
-  MPI_Finalize();
-
-  return 0;
-}
diff --git a/src/coarse-impl.h b/src/coarse-impl.h
deleted file mode 100644
index 65f8ff9c..00000000
--- a/src/coarse-impl.h
+++ /dev/null
@@ -1,31 +0,0 @@
-#ifndef _PARRSB_COARSE_IMPL_H_
-#define _PARRSB_COARSE_IMPL_H_
-
-#include "coarse.h"
-
-uint unique_ids(sint *perm, ulong *uid, uint n, const ulong *ids, buffer *bfr);
-
-struct coarse {
-  unsigned type;       // type = schur-2-lvl, schur-3-lvl
-  unsigned null_space; // Is there a null space or not
-  uint un;             // User vector size
-  uint cn;   // Compressed (ignoring duplicates and zero global ids) vector size
-  uint an;   // Assembled size -- this is the local size of the assmebled coarse
-             // matrix
-  sint *u2c; // Mapping from user vector to compress vector
-  struct gs_data *c2a; // Mapping from compressed vector to assmbled vector
-  buffer bfr;
-
-  ulong s[3], ng[3];
-  uint n[3];
-  struct comm c;
-  void *solver;
-};
-
-int schur_setup(struct coarse *crs, struct array *eij, struct crystal *cr,
-                buffer *bfr);
-int schur_solve(scalar *x, struct coarse *crs, scalar *b, scalar tol,
-                buffer *bfr);
-int schur_free(struct coarse *crs);
-
-#endif
diff --git a/src/coarse-laplacian.c b/src/coarse-laplacian.c
deleted file mode 100644
index 9dc2dd6d..00000000
--- a/src/coarse-laplacian.c
+++ /dev/null
@@ -1,293 +0,0 @@
-#include "coarse-impl.h"
-#include "metrics.h"
-#include <float.h>
-
-//------------------------------------------------------------------------------
-// Setup coarse grid system. Initial dumb API.
-//
-// Number rows, local first then interface. Returns global number of local
-// elements.
-struct rcb_t {
-  uint i, s;
-  double coord[3];
-  slong vtx[8];
-};
-
-static void nmbr_local_rcb(struct array *a, uint s, uint e, const unsigned nc,
-                           const unsigned ndim, const unsigned level,
-                           struct comm *c, buffer *bfr) {
-  sint size = e - s;
-  if (size <= 1)
-    return;
-
-  double max[3] = {-DBL_MAX, -DBL_MAX, -DBL_MAX},
-         min[3] = {DBL_MAX, DBL_MAX, DBL_MAX};
-
-  struct rcb_t *pa = (struct rcb_t *)a->ptr;
-  for (uint i = s; i < e; i++) {
-    for (int j = 0; j < ndim; j++) {
-      if (pa[i].coord[j] < min[j])
-        min[j] = pa[i].coord[j];
-      if (pa[i].coord[j] > max[j])
-        max[j] = pa[i].coord[j];
-    }
-  }
-
-  double len = max[0] - min[0];
-  int axis = 0;
-  for (int j = 1; j < ndim; j++) {
-    if (max[j] - min[j] > len)
-      axis = j, len = max[j] - min[j];
-  }
-
-  struct rcb_t *ps = pa + s;
-  switch (axis) {
-  case 0:
-    sarray_sort(struct rcb_t, ps, size, coord[0], 3, bfr);
-    break;
-  case 1:
-    sarray_sort(struct rcb_t, ps, size, coord[1], 3, bfr);
-    break;
-  case 2:
-    sarray_sort(struct rcb_t, ps, size, coord[2], 3, bfr);
-    break;
-  default:
-    break;
-  }
-
-  // Number the elements in the interface
-  uint npts = size * nc;
-  slong *vtx = tcalloc(slong, npts);
-  for (uint i = s, k = 0; i < e; i++) {
-    for (int j = 0; j < nc; j++, k++)
-      vtx[k] = pa[i].vtx[j];
-  }
-
-  struct gs_data *gsh = gs_setup(vtx, npts, c, 0, gs_pairwise, 0);
-
-  sint *dof = tcalloc(sint, npts);
-  uint mid = (s + e) / 2;
-  for (uint i = mid, k = (mid - s) * nc; i < e; i++) {
-    for (int j = 0; j < nc; j++, k++)
-      dof[k] = 1;
-  }
-
-  gs(dof, gs_int, gs_add, 0, gsh, bfr);
-
-  for (uint i = mid, k = (mid - s) * nc; i < e; i++) {
-    for (int j = 0; j < nc; j++, k++)
-      dof[k] = 0;
-  }
-
-  gs(dof, gs_int, gs_add, 0, gsh, bfr);
-
-  for (uint i = s, k = 0; i < e; i++, k++) {
-    for (int j = 0; j < nc; j++) {
-      if (dof[k * nc + j] > 0 && pa[i].s == INT_MAX) {
-        pa[i].s = level;
-        break;
-      }
-    }
-  }
-
-  gs_free(gsh);
-  free(dof), free(vtx);
-
-  nmbr_local_rcb(a, s, mid, nc, ndim, level + 1, c, bfr);
-  nmbr_local_rcb(a, mid, e, nc, ndim, level + 1, c, bfr);
-}
-
-// Number the DOFs internal first, faces second and all the rest (wire basket)
-// next. This keeps zeros as is and renumber the positive entries in `ids`
-// array.
-static void number_dual_graph_dofs(ulong *dofs, struct coarse *crs, uint n,
-                                   const slong *ids, uint nelt, unsigned ndim,
-                                   const scalar *coord, buffer *bfr) {
-  int nnz = (n > 0);
-  struct comm c;
-  comm_split(&crs->c, nnz, crs->c.id, &c);
-
-  unsigned nc = n / nelt;
-  uint i, j;
-  if (nnz) {
-    sint *dof = tcalloc(sint, n);
-    int level = 1;
-    while (c.np > 1) {
-      struct gs_data *gsh = gs_setup(ids, n, &c, 0, gs_pairwise, 0);
-
-      int bin = (c.id >= (c.np + 1) / 2);
-      for (i = 0; i < n; i++)
-        dof[i] = bin;
-
-      gs(dof, gs_int, gs_add, 0, gsh, bfr);
-
-      if (bin == 1) {
-        for (i = 0; i < n; i++)
-          dof[i] = 0;
-      }
-
-      gs(dof, gs_int, gs_add, 0, gsh, bfr);
-
-      for (i = 0; i < nelt; i++) {
-        for (j = 0; j < nc; j++) {
-          if (dof[i * nc + j] > 0 && !dofs[i]) {
-            dofs[i] = level;
-            break;
-          }
-        }
-      }
-
-      gs_free(gsh);
-
-      struct comm t;
-      comm_split(&c, bin, c.id, &t);
-      comm_free(&c);
-      comm_dup(&c, &t);
-      comm_free(&t);
-
-      level++;
-    }
-    free(dof);
-  }
-
-  for (i = crs->n[0] = crs->n[1] = 0; i < nelt; i++) {
-    if (dofs[i] > 0)
-      crs->n[1]++;
-    else
-      crs->n[0]++;
-  }
-
-  slong in[2] = {crs->n[0], crs->n[1]}, out[2][2], wrk[2][2];
-  comm_scan(out, &crs->c, gs_long, gs_add, in, 2, wrk);
-  crs->s[0] = out[0][0] + 1, crs->ng[0] = out[1][0];
-  crs->s[1] = out[0][1] + 1, crs->ng[1] = out[1][1];
-
-  struct array local;
-  array_init(struct rcb_t, &local, crs->n[0]);
-
-  struct rcb_t t = {.s = INT_MAX};
-  ulong s = crs->ng[0] + crs->s[1];
-  for (uint i = 0; i < nelt; i++) {
-    if (dofs[i] > 0)
-      dofs[i] = s++;
-    else {
-      t.i = i;
-      memcpy(t.coord, &coord[i * ndim], ndim * sizeof(scalar));
-      memcpy(t.vtx, &ids[i * nc], nc * sizeof(slong));
-      array_cat(struct rcb_t, &local, &t, 1);
-    }
-  }
-
-  if (local.n > 0) {
-    nmbr_local_rcb(&local, 0, local.n, nc, ndim, 1, &c, bfr);
-    sarray_sort(struct rcb_t, local.ptr, local.n, s, 0, bfr);
-    struct rcb_t *pl = (struct rcb_t *)local.ptr;
-    ulong s = crs->s[0];
-    for (sint i = local.n - 1; i >= 0; i--)
-      dofs[pl[i].i] = s++;
-  }
-
-  comm_free(&c);
-  array_free(&local);
-}
-
-struct coarse *coarse_setup(unsigned n, unsigned nc, const long long *vl,
-                            const scalar *coord, unsigned null_space,
-                            unsigned type, struct comm *c) {
-  comm_barrier(c);
-  double tcrs = comm_time();
-
-  // crs->un is the user vector size.
-  struct coarse *crs = tcalloc(struct coarse, 1);
-  crs->null_space = null_space, crs->type = type, crs->un = n;
-  for (unsigned i = 0; i < 3; i++)
-    crs->ng[i] = crs->s[i] = crs->n[i] = 0;
-
-  // Setup the buffer and duplicate the communicator.
-  buffer_init(&crs->bfr, 1024);
-  comm_dup(&crs->c, c);
-
-  uint size = n * nc;
-  slong *tid = tcalloc(slong, size);
-  for (uint i = 0; i < size; i++)
-    tid[i] = vl[i];
-
-  ulong *nid = tcalloc(ulong, n);
-  unsigned ndim = (nc == 8) ? 3 : 2;
-  number_dual_graph_dofs(nid, crs, size, tid, crs->un, ndim, coord, &crs->bfr);
-
-  // Find unique ids and user vector to compressed vector mapping.
-  // In the case of dual-graph Laplacian, all the ids are unique.
-  // But here we arrange them in the sorted order.
-  ulong *uid = tcalloc(ulong, n);
-  crs->u2c = tcalloc(sint, n);
-  crs->cn = unique_ids(crs->u2c, uid, crs->un, nid, &crs->bfr);
-  crs->an = crs->cn;
-
-  struct crystal cr;
-  crystal_init(&cr, &crs->c);
-
-  struct array nbrs, eij;
-  find_nbrs(&nbrs, nid, tid, n, nc, &cr, &crs->bfr);
-  // Convert `struct nbr` -> `struct mij` and compress entries which share the
-  // same (r, c) values. Set the diagonal element to have zero row sum
-  compress_nbrs(&eij, &nbrs, &crs->bfr);
-  array_free(&nbrs);
-
-  switch (type) {
-  case 0:
-    schur_setup(crs, &eij, &cr, &crs->bfr);
-    break;
-  default:
-    break;
-  }
-
-  array_free(&eij), crystal_free(&cr);
-  free(tid), free(nid), free(uid);
-
-  return crs;
-}
-
-void coarse_solve(scalar *x, struct coarse *crs, scalar *b, scalar tol) {
-  metric_init();
-
-  scalar *rhs = tcalloc(scalar, 2 * crs->an), *xx = rhs + crs->an;
-  for (uint i = 0; i < crs->un; i++) {
-    if (crs->u2c[i] >= 0)
-      rhs[crs->u2c[i]] += b[i];
-  }
-
-  switch (crs->type) {
-  case 0:
-    schur_solve(xx, crs, rhs, tol, &crs->bfr);
-    break;
-  default:
-    break;
-  }
-
-  for (uint i = 0; i < crs->un; i++) {
-    if (crs->u2c[i] >= 0)
-      x[i] = xx[crs->u2c[i]];
-  }
-  free(rhs);
-
-  metric_push_level();
-  metric_crs_print(&crs->c, 1);
-  metric_finalize();
-}
-
-void coarse_free(struct coarse *crs) {
-  if (crs != NULL) {
-    switch (crs->type) {
-    case 0:
-      schur_free(crs);
-      break;
-    default:
-      break;
-    }
-    if (crs->u2c)
-      free(crs->u2c);
-    comm_free(&crs->c), buffer_free(&crs->bfr);
-    free(crs), crs = NULL;
-  }
-}
diff --git a/src/coarse.c b/src/coarse.c
deleted file mode 100644
index 08ee3155..00000000
--- a/src/coarse.c
+++ /dev/null
@@ -1,394 +0,0 @@
-#include "coarse-impl.h"
-#include "metrics.h"
-#include "sort.h"
-
-//------------------------------------------------------------------------------
-// Better API for coarse grid system.
-//
-uint unique_ids(sint *perm, ulong *uid, uint n, const ulong *ids, buffer *bfr) {
-  struct id_t {
-    ulong id;
-    uint idx;
-    sint perm;
-  };
-
-  struct array arr;
-  array_init(struct id_t, &arr, n);
-
-  uint i;
-  struct id_t t = {.id = 0, .idx = 0, .perm = -1};
-  for (i = 0; i < n; i++) {
-    t.id = ids[i], t.idx = i;
-    array_cat(struct id_t, &arr, &t, 1);
-  }
-
-  sarray_sort(struct id_t, arr.ptr, arr.n, id, 1, bfr);
-  struct id_t *pa = (struct id_t *)arr.ptr;
-
-  // Ignore the ids numbered zero
-  sint un = 0;
-  ulong last = 0;
-  for (uint i = 0; i < arr.n; i++) {
-    ulong v = pa[i].id;
-    if (v != last)
-      last = uid[un] = v, un++;
-    pa[i].perm = un - 1;
-  }
-
-  sarray_sort(struct id_t, pa, n, idx, 0, bfr);
-  pa = (struct id_t *)arr.ptr;
-  for (i = 0; i < n; i++)
-    perm[i] = pa[i].perm;
-
-  array_free(&arr);
-  return un;
-}
-
-// Number rows, local first then interface. Returns global number of local
-// elements.
-struct rsb_t {
-  uint i, s;
-  slong vtx[8];
-};
-
-static void number_dofs(slong *nid, struct coarse *crs, const slong *ids,
-                        const ulong *uid) {
-  uint un = crs->un;
-  buffer *bfr = &crs->bfr;
-  struct comm *ci = &crs->c;
-  sint *u2c = crs->u2c;
-
-  int nnz = (un > 0);
-  struct comm c;
-  comm_split(ci, nnz, ci->id, &c);
-
-  uint i, j;
-  if (nnz) {
-    sint *dof = tcalloc(sint, un);
-    int level = 1;
-    while (c.np > 1) {
-      struct gs_data *gsh = gs_setup(ids, un, &c, 0, gs_pairwise, 0);
-
-      int bin = (c.id >= (c.np + 1) / 2);
-      for (i = 0; i < un; i++)
-        dof[i] = bin;
-
-      gs(dof, gs_int, gs_add, 0, gsh, bfr);
-
-      if (bin == 1) {
-        for (i = 0; i < un; i++)
-          dof[i] = 0;
-      }
-
-      gs(dof, gs_int, gs_add, 0, gsh, bfr);
-
-      for (i = 0; i < un; i++) {
-        if (dof[i] > 0 && u2c[i] >= 0 && !nid[u2c[i]])
-          nid[u2c[i]] = level;
-      }
-
-      gs_free(gsh);
-
-      struct comm t;
-      comm_split(&c, bin, c.id, &t);
-      comm_free(&c);
-      comm_dup(&c, &t);
-      comm_free(&t);
-
-      level++;
-    }
-    free(dof);
-  }
-
-  // Calculate unqiue local and interface nodes based on compress ids.
-  // Finding unique local ids is easy. To find unique interface ids, we
-  // will have to sort in parallel and then manually find the unique ids.
-  struct dof_t {
-    ulong id, nid;
-    uint p, p0, idx;
-  };
-
-  struct array arr;
-  array_init(struct dof_t, &arr, crs->cn);
-
-  uint ln = 0;
-  struct dof_t t = {.id = 0, .nid = 0, .p = 0, .p0 = ci->id, .idx = 0};
-  for (i = 0; i < crs->cn; i++) {
-    if (!nid[i])
-      ln++;
-    else
-      t.id = uid[i], t.idx = i, array_cat(struct dof_t, &arr, &t, 1);
-  }
-  crs->n[0] = ln;
-
-  slong cnt[1] = {ln}, out[2][1], wrk[2][1];
-  comm_scan(out, ci, gs_long, gs_add, cnt, 1, wrk);
-  crs->s[0] = out[0][0] + 1, crs->ng[0] = out[1][0];
-
-  for (i = 0, ln = 0; i < crs->cn; i++) {
-    if (!nid[i])
-      nid[i] = crs->s[0] + ln, ln++;
-  }
-  assert(crs->n[0] == ln);
-
-  // parallel_sort and set nid and send back to p0
-  parallel_sort(struct dof_t, &arr, id, gs_long, 0, 0, ci, bfr);
-
-  uint in = 0;
-  if (arr.n > 0) {
-    struct dof_t *pa = (struct dof_t *)arr.ptr;
-    for (i = in = 1; i < arr.n; i++)
-      in += (pa[i].id != pa[i - 1].id);
-  }
-
-  cnt[0] = in;
-  comm_scan(out, ci, gs_long, gs_add, cnt, 1, wrk);
-  crs->ng[1] = out[1][0];
-  slong s = crs->ng[0] + out[0][0] + 1;
-
-  if (in) {
-    struct dof_t *pa = (struct dof_t *)arr.ptr;
-    i = 0;
-    while (i < arr.n) {
-      for (j = i + 1; j < arr.n && pa[j].id == pa[i].id; j++)
-        ;
-      for (; i < j; i++)
-        pa[i].nid = s;
-      s++;
-    }
-  }
-
-  struct crystal cr;
-  crystal_init(&cr, ci);
-  sarray_transfer(struct dof_t, &arr, p0, 0, &cr);
-  crystal_free(&cr);
-
-  sarray_sort(struct dof_t, arr.ptr, arr.n, id, 1, bfr);
-  struct dof_t *pa = (struct dof_t *)arr.ptr;
-  for (i = 0; i < arr.n; i++)
-    nid[pa[i].idx] = pa[i].nid;
-
-  array_free(&arr);
-  comm_free(&c);
-}
-
-// n  = ncr * nelt
-// nz = ncr * ncr * nelt
-struct coarse *crs_parrsb_setup(uint n, const ulong *id, uint nz,
-                                const uint *Ai, const uint *Aj, const scalar *A,
-                                unsigned null_space, unsigned type,
-                                const struct comm *c) {
-  comm_barrier(c);
-  double tcrs = comm_time();
-
-  // crs->un is the user vector size.
-  struct coarse *crs = tcalloc(struct coarse, 1);
-  crs->null_space = null_space, crs->type = type, crs->un = n;
-  for (unsigned i = 0; i < 3; i++)
-    crs->ng[i] = crs->s[i] = crs->n[i] = 0;
-
-  // Setup the buffer and duplicate the communicator.
-  buffer_init(&crs->bfr, 1024);
-  comm_dup(&crs->c, c);
-
-  // Let's renumber the ids just in case its the schur solver. Schwarz solver
-  // doesn't need re-numbering but we are going to go ahead and do it.
-  slong *tid = tcalloc(slong, crs->un);
-  for (uint i = 0; i < n; i++)
-    tid[i] = id[i];
-
-  // Find the mapping from user ids to unique ids (compressed ids) local to the
-  // processor. Compressed vector size is `crs->cn`.
-  ulong *uid = tcalloc(ulong, crs->un);
-  crs->u2c = tcalloc(sint, crs->un);
-  crs->cn = unique_ids(crs->u2c, uid, crs->un, tid, &crs->bfr);
-#if 0
-  for (uint i = 0; i < crs->un; i++) {
-    printf("p = %d i = %u perm[i] = %d\n", c->id, i, crs->u2c[i]);
-    fflush(stdout);
-  }
-#endif
-
-  // Now renumber unique ids based on whether they are internal or on interface.
-  slong *nid = tcalloc(slong, crs->cn);
-  number_dofs(nid, crs, tid, uid);
-  free(tid), free(uid);
-
-  // Now let's setup the coarse system. Create `struct mij` entries and pass
-  // them into schur setup. Which processor owns the dof? All the local dofs
-  // are owned by those specific preocessors -- interface dofs are owned in
-  // a load balanced manner.
-  uint nr = crs->ng[1] / c->np, nrem = crs->ng[1] - nr * c->np;
-  uint p0 = c->np - nrem;
-  ulong s0 = p0 * nr;
-
-  struct array mijs;
-  array_init(struct mij, &mijs, n);
-
-  struct mij m = {.r = 0, .c = 0, .idx = 0, .p = 0, .v = 0};
-  for (uint k = 0; k < nz; k++) {
-    sint i = crs->u2c[Ai[k]], j = crs->u2c[Aj[k]];
-    if (i < 0 || j < 0 || A[k] == 0)
-      continue;
-    m.r = nid[i], m.c = nid[j], m.v = A[k], m.p = c->id;
-    if (m.r > crs->ng[0]) {
-      if (m.r - crs->ng[0] <= s0)
-        m.p = (m.r - crs->ng[0] - 1) / nr;
-      else
-        m.p = p0 + (m.r - crs->ng[0] - s0 - 1) / (nr + 1);
-    }
-    array_cat(struct mij, &mijs, &m, 1);
-  }
-
-  // Now let's assemble the matrix by sending load balancing the interface rows.
-  // Assembled size is `an`.
-  struct crystal cr;
-  crystal_init(&cr, c);
-  sarray_transfer(struct mij, &mijs, p, 1, &cr);
-
-  nid = trealloc(slong, nid, crs->cn + crs->n[0] + nr + 1);
-  for (uint i = 0; i < crs->cn; i++)
-    nid[i] = -nid[i];
-
-  crs->an = 0;
-  if (mijs.n > 0) {
-    sarray_sort_2(struct mij, mijs.ptr, mijs.n, r, 1, c, 1, &crs->bfr);
-    struct mij *pm = (struct mij *)mijs.ptr;
-    uint i = 0, j;
-    while (i < mijs.n) {
-      for (j = i + 1; j < mijs.n && pm[j].r == pm[i].r; j++)
-        ;
-      nid[crs->cn + crs->an] = pm[i].r, crs->an++, i = j;
-    }
-  }
-  crs->n[1] = crs->an - crs->n[0];
-  crs->s[1] = nid[crs->cn + crs->n[0]];
-  crs->c2a = gs_setup(nid, crs->cn + crs->an, c, 0, gs_pairwise, 0);
-
-  tcrs = comm_time() - tcrs;
-  double wrk, min = tcrs, max = tcrs;
-  comm_allreduce(c, gs_double, gs_max, &max, 1, &wrk);
-  comm_allreduce(c, gs_double, gs_min, &min, 1, &wrk);
-  if (c->id == 0) {
-    printf("parrsb_crs_setup: %g %g (min max)\n", min, max);
-    fflush(stdout);
-  }
-
-  comm_barrier(c);
-  tcrs = comm_time();
-
-  switch (type) {
-  case 0:
-    schur_setup(crs, &mijs, &cr, &crs->bfr);
-    break;
-  default:
-    break;
-  }
-
-  min = max = comm_time() - tcrs;
-  comm_allreduce(c, gs_double, gs_max, &max, 1, &wrk);
-  comm_allreduce(c, gs_double, gs_min, &min, 1, &wrk);
-  if (c->id == 0) {
-    printf("schur_setup: %g %g (min max)\n", min, max);
-    fflush(stdout);
-  }
-
-  array_free(&mijs), crystal_free(&cr);
-
-  return crs;
-}
-
-void crs_parrsb_solve(scalar *x, struct coarse *crs, scalar *b, scalar tol) {
-  metric_init();
-
-  scalar *rhs = tcalloc(scalar, crs->cn + crs->an);
-  for (uint i = 0; i < crs->un; i++) {
-    if (crs->u2c[i] >= 0)
-      rhs[crs->u2c[i]] += b[i];
-  }
-
-#if 0
-  for (uint i = 0; i < crs->cn; i++) {
-    printf("p = %d i = %u before b[i] = %lf\n", crs->c.id, i, rhs[i]);
-    fflush(stdout);
-  }
-#endif
-
-  gs(rhs, gs_double, gs_add, 1, crs->c2a, &crs->bfr);
-
-#if 0
-  char name[BUFSIZ];
-  snprintf(name, BUFSIZ, "rsb_b_np_%d_id_%d_nl_%lld_ni_%lld.txt", crs->c.np,
-           crs->c.id, crs->n[0], crs->n[1]);
-  FILE *fp = fopen(name, "w");
-  if (fp) {
-    for (uint i = 0; i < crs->an; i++)
-      fprintf(fp, "%lf\n", rhs[crs->cn + i]);
-    fclose(fp);
-  }
-#endif
-
-#if 0
-  for (uint i = 0; i < crs->an; i++) {
-    printf("p = %d i = %u after b[i] = %lf\n", crs->c.id, i, rhs[crs->cn + i]);
-    fflush(stdout);
-  }
-#endif
-
-  switch (crs->type) {
-  case 0:
-    schur_solve(rhs + crs->cn, crs, rhs + crs->cn, tol, &crs->bfr);
-    break;
-  default:
-    break;
-  }
-
-#if 0
-  for (uint i = 0; i < crs->an; i++) {
-    printf("p = %d i = %u x[i] = %lf w[i] = %lf\n", crs->c.id, i,
-           rhs[crs->cn + i], weights[crs->cn + i]);
-    fflush(stdout);
-  }
-#endif
-
-  gs(rhs, gs_double, gs_add, 0, crs->c2a, &crs->bfr);
-  for (uint i = 0; i < crs->un; i++) {
-    if (crs->u2c[i] >= 0)
-      x[i] = rhs[crs->u2c[i]];
-    else
-      x[i] = 0;
-  }
-  free(rhs);
-
-#if 0
-  snprintf(name, BUFSIZ, "rsb_x_np_%d_id_%d_un_%u.txt", crs->c.np, crs->c.id,
-           crs->un);
-  fp = fopen(name, "w");
-  if (fp) {
-    for (uint i = 0; i < crs->un; i++)
-      fprintf(fp, "%lf\n", x[i]);
-    fclose(fp);
-  }
-#endif
-
-  metric_push_level();
-  metric_crs_print(&crs->c, 1);
-  metric_finalize();
-}
-
-void crs_parrsb_free(struct coarse *crs) {
-  if (crs != NULL) {
-    switch (crs->type) {
-    case 0:
-      schur_free(crs);
-      break;
-    default:
-      break;
-    }
-    if (crs->u2c)
-      free(crs->u2c);
-    gs_free(crs->c2a);
-    comm_free(&crs->c), buffer_free(&crs->bfr);
-    free(crs), crs = NULL;
-  }
-}
diff --git a/src/coarse.h b/src/coarse.h
deleted file mode 100644
index 49698985..00000000
--- a/src/coarse.h
+++ /dev/null
@@ -1,28 +0,0 @@
-#ifndef _PARRSB_COARSE_H_
-#define _PARRSB_COARSE_H_
-
-#include "gslib.h"
-#include "mat.h"
-
-struct coarse;
-
-// API for the Laplacian (which involves solving for the dual graph)
-struct coarse *coarse_setup(unsigned nelt, unsigned nv, const long long *vtx,
-                            const scalar *coord, unsigned null_space,
-                            unsigned type, struct comm *c);
-void coarse_solve(scalar *x, struct coarse *crs, scalar *b, scalar tol);
-void coarse_free(struct coarse *crs);
-
-// Alternative API for a general matrix
-#define crs_parrsb_setup PREFIXED_NAME(crs_parrsb_setup)
-#define crs_parrsb_solve PREFIXED_NAME(crs_parrsb_solve)
-#define crs_parrsb_free PREFIXED_NAME(crs_parrsb_free)
-
-struct coarse *crs_parrsb_setup(uint n, const ulong *id, uint nz,
-                                const uint *Ai, const uint *Aj, const scalar *A,
-                                unsigned null_space, unsigned type,
-                                const struct comm *comm);
-void crs_parrsb_solve(scalar *x, struct coarse *crs, scalar *b, scalar tol);
-void crs_parrsb_free(struct coarse *crs);
-
-#endif
diff --git a/src/components.c b/src/components.c
index 28eda2f8..3efb84fb 100644
--- a/src/components.c
+++ b/src/components.c
@@ -36,7 +36,8 @@ uint get_components(sint *component, struct array *elems, unsigned nv,
 
   struct comm cc;
   uint count = 0;
-  slong nnz1, nnzg, nnzg0, nnzb, nmarked = 0;
+  slong nnz1, nnzg, nnzg0, nnzb;
+  ulong nmarked = 0;
   do {
     // Count unmarked elements
     arr.n = 0;
@@ -120,13 +121,13 @@ struct cmp_t {
 };
 
 static sint find_or_insert(struct array *cids, struct cmp_t *t) {
-  // If there are no elements in the array, insert and exit
+  // If there are no elements in the array, insert and exit.
   if (cids->n == 0) {
     array_cat(struct cmp_t, cids, t, 1);
     return -1;
   }
 
-  // Otherwise, we will do a binary search
+  // Otherwise, we will do a binary search.
   struct cmp_t *pc = (struct cmp_t *)cids->ptr;
   sint s = 0, e = cids->n - 1, mid = 0;
   while (s <= e) {
@@ -139,7 +140,7 @@ static sint find_or_insert(struct array *cids, struct cmp_t *t) {
       s = mid + 1;
   }
 
-  // Okay, not found -- insert at `mid` or `mid + 1`
+  // Okay, not found -- insert at `mid` or `mid + 1`.
   uint max = cids->max;
   if (max == cids->n) {
     max += max / 2 + 1;
@@ -158,7 +159,7 @@ static sint find_or_insert(struct array *cids, struct cmp_t *t) {
   }
   pc[n] = t0, cids->n++;
 
-  // Sanity check
+  // Sanity check.
   for (unsigned i = 1; i < cids->n; i++)
     assert(pc[i - 1].c < pc[i].c);
 
@@ -177,8 +178,9 @@ uint get_components_v2(sint *component, struct array *elems, unsigned nv,
   if (nelg == 0)
     return 0;
 
-  uint nev = nelt * nv;
-  sint *p0 = tcalloc(sint, 2 * nev), *p = p0 + nev;
+  const uint nev = nelt * nv;
+  sint *p0 = tcalloc(sint, nev);
+  sint *p = tcalloc(sint, nev);
   slong *ids = tcalloc(slong, nev);
   uint *inds = tcalloc(uint, nev);
 
@@ -190,9 +192,10 @@ uint get_components_v2(sint *component, struct array *elems, unsigned nv,
     component[e] = -1;
 
   struct comm c;
-  slong nmkd = 0, nc = 0;
+  ulong nmkd = 0;
+  slong nc = 0;
   do {
-    // Copy unmarked elements to ids
+    // Copy unmarked elements to ids.
     uint unmkd = 0;
     for (uint e = 0; e < nelt; e++) {
       if (component[e] == -1) {
@@ -206,28 +209,30 @@ uint get_components_v2(sint *component, struct array *elems, unsigned nv,
     int bin = (unmkd > 0);
     comm_split(ci, bin, ci->id, &c);
 
-    slong nnzg = 0, nnzg0 = 0, ncg = 0;
+    slong nnzg = 0, ncg = 0;
     if (bin == 1) {
-      // Setup gs
-      struct gs_data *gsh = gs_setup(ids, unmkd * nv, &c, 0, gs_pairwise, 0);
-
-      // Mark the first unmarked element as seed for the component c.id
+      // Mark the first unmarked element as seed for the component c.id.
       for (uint v = 0; v < nv; v++)
         p[0 * nv + v] = c.id;
 
-      // Initialize the rest of p
+      // Initialize the rest of p.
       for (uint e = 1; e < unmkd; e++)
         for (uint v = 0; v < nv; v++)
           p[e * nv + v] = -1;
 
-      sint nnz, changed;
+      // Setup gather-scatter to do BFS.
+      struct gs_data *gsh = gs_setup(ids, unmkd * nv, &c, 0, gs_pairwise, 0);
+
+      // Perform BFS.
+      sint changed;
       do {
         for (uint i = 0; i < unmkd * nv; i++)
           p0[i] = p[i];
 
         gs(p, gs_int, gs_max, 0, gsh, bfr);
 
-        nnz = changed = 0;
+        changed = 0;
+        sint nnz = 0;
         for (uint e = 0; e < unmkd; e++) {
           sint v0 = -1;
           for (uint v = 0; v < nv; v++) {
@@ -239,7 +244,8 @@ uint get_components_v2(sint *component, struct array *elems, unsigned nv,
             }
           }
 
-          // There was one non-zero vertex in the element
+          // If at least one vertex of the element is already marked (its value
+          // in `p` is not -1), we mark the whole element with that value.
           if (v0 > -1) {
             sint c = p[e * nv + v0];
             for (uint v = 0; v < nv; v++)
@@ -247,6 +253,7 @@ uint get_components_v2(sint *component, struct array *elems, unsigned nv,
             nnz++;
           }
 
+          // Check if the component id changed.
           for (uint v = 0; v < nv; v++) {
             if (p[e * nv + v] != p0[e * nv + v]) {
               changed = 1;
@@ -255,14 +262,15 @@ uint get_components_v2(sint *component, struct array *elems, unsigned nv,
           }
         }
 
-        nnzg0 = nnzg, nnzg = nnz;
+        nnzg = nnz;
         comm_allreduce(&c, gs_long, gs_add, &nnzg, 1, wrk);
         comm_allreduce(&c, gs_int, gs_add, &changed, 1, wrk);
       } while (changed);
+
       gs_free(gsh);
 
       // Find unique local components and then use them to find unique
-      // global components
+      // global components.
       struct array cids;
       array_init(struct cmp_t, &cids, 100);
 
@@ -276,10 +284,13 @@ uint get_components_v2(sint *component, struct array *elems, unsigned nv,
 
       struct crystal cr;
       crystal_init(&cr, &c);
-      sarray_transfer(struct cmp_t, &cids, p, 1, &cr);
 
-      // find unique components and number them
+      // Send each component id `C` to processor `C % P`, where `P` is the
+      // number of processors.
+      sarray_transfer(struct cmp_t, &cids, p, 1, &cr);
       sarray_sort(struct cmp_t, cids.ptr, cids.n, c, 0, bfr);
+
+      // Find unique components and number them globally.
       uint cnt = 0;
       if (cids.n > 0) {
         cnt++;
@@ -307,8 +318,9 @@ uint get_components_v2(sint *component, struct array *elems, unsigned nv,
 
       sarray_transfer(struct cmp_t, &cids, p, 0, &cr);
       crystal_free(&cr);
-
       sarray_sort(struct cmp_t, cids.ptr, cids.n, c, 0, bfr);
+
+      // Now assign the global component id to the marked elements.
       for (uint e = 0; e < unmkd; e++) {
         if (p[e * nv + 0] > -1) {
           t.c = p[e * nv + 0];
@@ -328,9 +340,9 @@ uint get_components_v2(sint *component, struct array *elems, unsigned nv,
     nc += ncg;
   } while (nmkd < nelg);
 
-  free(p0), free(ids), free(inds);
   if (null_input == 1)
     free(component);
+  free(p0), free(p), free(ids), free(inds);
 
   return nc;
 }
diff --git a/src/con-check.c b/src/con-check.c
index f0e57937..dfe9e86a 100644
--- a/src/con-check.c
+++ b/src/con-check.c
@@ -24,8 +24,8 @@ typedef struct {
 } ProcID;
 
 static VToEMap *getVToEMap(Mesh m, struct comm *c, buffer *bfr) {
-  sint nelt = m->nelt;
-  sint nv = m->nv;
+  uint nelt = m->nelt;
+  uint nv = m->nv;
 
   slong out[2][1], buf[2][1], in = nelt;
   comm_scan(out, c, gs_long, gs_add, &in, 1, buf);
@@ -38,7 +38,7 @@ static VToEMap *getVToEMap(Mesh m, struct comm *c, buffer *bfr) {
 
   // Create (globalId, elementId) pairs and send them to globalId % np
   Point ptr = m->elements.ptr;
-  sint i, j;
+  uint i, j;
   for (i = 0; i < nelt; i++) {
     for (j = 0; j < nv; j++) {
       ulong globalId = ptr[i * nv + j].globalId + 1;
@@ -90,7 +90,7 @@ static VToEMap *getVToEMap(Mesh m, struct comm *c, buffer *bfr) {
   array_init(ProcID, &procs, 10);
 
   vPtr = vtcsCmpct.ptr;
-  sint s = 0, e;
+  uint s = 0, e;
   vertex t;
   ProcID p;
   while (s < vtcsCmpct.n) {
@@ -168,12 +168,12 @@ static VToEMap *getVToEMap(Mesh m, struct comm *c, buffer *bfr) {
 }
 
 // key must be present in globalIds
-static int getPosition(VToEMap *map, ulong key) {
+static uint getPosition(VToEMap *map, ulong key) {
   ulong *globalIds = map->globalIds;
 
-  int begin = 0;
-  int end = map->size;
-  int mid = 0;
+  uint begin = 0;
+  uint end = map->size;
+  uint mid = 0;
   while (begin < end) {
     mid = (begin + end) / 2;
 
@@ -186,7 +186,7 @@ static int getPosition(VToEMap *map, ulong key) {
   };
 
   if (globalIds[mid] != key)
-    return -1;
+    return UINT_MAX;
   return mid;
 }
 
@@ -197,11 +197,11 @@ static void freeVToEMap(VToEMap *map) {
   free(map);
 }
 
-int faceCheck(Mesh mesh, struct comm *c, buffer *bfr) {
+int face_check(Mesh mesh, struct comm *c, buffer *bfr) {
   VToEMap *map = getVToEMap(mesh, c, bfr);
 
-  sint nelt = mesh->nelt;
-  sint ndim = mesh->ndim;
+  uint nelt = mesh->nelt;
+  uint ndim = mesh->ndim;
 
   int faces[GC_MAX_FACES][GC_MAX_FACE_VERTICES];
   if (ndim == 3)
@@ -210,24 +210,24 @@ int faceCheck(Mesh mesh, struct comm *c, buffer *bfr) {
     memcpy(faces, faces2D, GC_MAX_FACES * GC_MAX_FACE_VERTICES * sizeof(int));
 
   Point ptr = mesh->elements.ptr;
-  int nf = (ndim == 3) ? 6 : 4;
-  int nfv = (ndim == 3) ? 4 : 2;
-  int nv = (ndim == 3) ? 8 : 4;
+  uint nf = (ndim == 3) ? 6 : 4;
+  uint nfv = (ndim == 3) ? 4 : 2;
+  uint nv = (ndim == 3) ? 8 : 4;
 
   struct array shared;
   array_init(LongID, &shared, 200);
 
   int err = 0;
 
-  int i, j, k, l;
+  uint i, j, k, l;
   for (i = 0; i < nelt && err == 0; i++) {
     for (j = 0; j < nf && err == 0; j++) {
       shared.n = 0;
 
       for (k = 0; k < nfv; k++) {
         ulong globalId = ptr[i * nv + faces[j][k] - 1].globalId + 1;
-        int indx = getPosition(map, globalId);
-        assert(indx >= 0);
+        uint indx = getPosition(map, globalId);
+        assert(indx < UINT_MAX);
         LongID elemId;
         for (l = map->offsets[indx]; l < map->offsets[indx + 1]; l++) {
           elemId.id = map->elements[l];
@@ -265,10 +265,10 @@ int faceCheck(Mesh mesh, struct comm *c, buffer *bfr) {
   return err;
 }
 
-int elementCheck(Mesh mesh, struct comm *c, buffer *bfr) {
+int element_check(Mesh mesh, struct comm *c, buffer *bfr) {
   uint nelt = mesh->nelt;
   uint ndim = mesh->ndim;
-  int nv = (ndim == 3) ? 8 : 4;
+  uint nv = (ndim == 3) ? 8 : 4;
 
   LongID globalIds[8];
   Point ptr = mesh->elements.ptr;
diff --git a/src/con-impl.h b/src/con-impl.h
index 252acdd8..07bd8aa0 100644
--- a/src/con-impl.h
+++ b/src/con-impl.h
@@ -3,7 +3,6 @@
 
 #include "parrsb-impl.h"
 #include "sort.h"
-#include <stdarg.h>
 
 /*
  Preprocessor Corner notation:      Symmetric Corner notation:
@@ -126,10 +125,10 @@ int send_back(Mesh mesh, struct comm *c, buffer *bfr);
 int find_unique_vertices(Mesh mesh, struct comm *c, scalar tol, int verbose,
                          buffer *bfr);
 
-int matchPeriodicFaces(Mesh mesh, struct comm *c, buffer *bfr);
+int match_periodic_faces(Mesh mesh, struct comm *c, int verbose, buffer *bfr);
 
-int elementCheck(Mesh mesh, struct comm *c, buffer *bfr);
+int element_check(Mesh mesh, struct comm *c, buffer *bfr);
 
-int faceCheck(Mesh mesh, struct comm *c, buffer *bfr);
+int face_check(Mesh mesh, struct comm *c, buffer *bfr);
 
 #endif // _CON_IMPL_H_
diff --git a/src/con-periodic.c b/src/con-periodic.c
index e915d539..b0c61dc8 100644
--- a/src/con-periodic.c
+++ b/src/con-periodic.c
@@ -1,5 +1,7 @@
 #include "con-impl.h"
 
+#include <math.h>
+
 //==============================================================================
 // Handle periodic BCs
 //
@@ -26,9 +28,9 @@ static int compressPeriodicVertices(Mesh mesh, struct comm *c, buffer *bfr) {
   Point points = mesh->elements.ptr;
   uint npoints = mesh->elements.n;
 
-  sint i, nunique = 0;
+  uint i, nunique = 0;
   if (npoints > 0) {
-    slong current = points[0].globalId;
+    ulong current = points[0].globalId;
     points[0].globalId = nunique;
     for (i = 1; i < npoints; i++)
       if (points[i].globalId == current)
@@ -187,7 +189,8 @@ static int findConnectedPeriodicFaces(Mesh mesh, struct array *matched) {
 
   for (i = 0; i < bSize - 1; i++) {
     for (j = i + 1; j < bSize; j++)
-      if (ptr[j].bc[0] == ptr[i].elementId && ptr[j].bc[1] == ptr[i].faceId) {
+      if ((ulong)ptr[j].bc[0] == ptr[i].elementId &&
+          (ulong)ptr[j].bc[1] == ptr[i].faceId) {
         findConnectedPeriodicPairs(mesh, &ptr[i], &ptr[j], matched);
       }
   }
@@ -195,7 +198,7 @@ static int findConnectedPeriodicFaces(Mesh mesh, struct array *matched) {
 }
 
 static int gatherMatchingPeriodicFaces(Mesh mesh, struct comm *c) {
-  int size = c->np, rank = c->id;
+  uint size = c->np;
 
   BoundaryFace bPtr = mesh->boundary.ptr;
   int nFaces = mesh->boundary.n;
@@ -208,7 +211,7 @@ static int gatherMatchingPeriodicFaces(Mesh mesh, struct comm *c) {
   sint i;
   slong eid;
   for (i = 0; i < nFaces; i++) {
-    eid = MAX(bPtr[i].bc[0], bPtr[i].elementId);
+    eid = MAX((ulong)bPtr[i].bc[0], bPtr[i].elementId);
     if (eid < N)
       bPtr[i].proc = eid / nelt;
     else
@@ -263,19 +266,33 @@ static int setPeriodicFaceCoordinates(Mesh mesh, struct comm *c, buffer *buf) {
   return 0;
 }
 
-int matchPeriodicFaces(Mesh mesh, struct comm *c, buffer *bfr) {
+int match_periodic_faces(Mesh mesh, struct comm *c, int verbose, buffer *bfr) {
+  const char *functions[6] = {
+      "set_periodic_face_coords      ", "gather_matching_periodic_faces",
+      "find_connected_periodic_faces ", "renumber_periodic_vertices    ",
+      "compress_periodic_vertices    ", "send_back                     "};
+
+  parrsb_print(c, verbose, "\t\t%s ...", functions[0]);
   setPeriodicFaceCoordinates(mesh, c, bfr);
+
+  parrsb_print(c, verbose, "\t\t%s ...", functions[1]);
   gatherMatchingPeriodicFaces(mesh, c);
 
   struct array matched;
   array_init(struct mpair_t, &matched, 10);
   matched.n = 0;
 
+  parrsb_print(c, verbose, "\t\t%s ...", functions[2]);
   findConnectedPeriodicFaces(mesh, &matched);
+
+  parrsb_print(c, verbose, "\t\t%s ...", functions[3]);
   renumberPeriodicVertices(mesh, c, &matched, bfr);
   array_free(&matched);
 
+  parrsb_print(c, verbose, "\t\t%s ...", functions[4]);
   compressPeriodicVertices(mesh, c, bfr);
+
+  parrsb_print(c, verbose, "\t\t%s ...", functions[5]);
   send_back(mesh, c, bfr);
 
   return 0;
diff --git a/src/con-unique-vertices.c b/src/con-unique-vertices.c
index f610dc8b..b4564f8a 100644
--- a/src/con-unique-vertices.c
+++ b/src/con-unique-vertices.c
@@ -63,10 +63,12 @@ static void tuple_sort_(void *ra, uint n, uint usize, uint offset) {
   tuple_sort_((void *)arr, n, sizeof(T), offsetof(T, index))
 
 static void sort_segments_local(struct array *local, int dim) {
-  sint npts = local->n;
-  struct point_t *pts = (struct point_t *)local->ptr;
+  uint npts = local->n;
+  if (npts == 0)
+    return;
 
-  sint s = 0, e;
+  struct point_t *const pts = (struct point_t *const)local->ptr;
+  uint s = 0, e;
   while (s < npts) {
     for (e = s + 1; e < npts && pts[e].ifSegment == 0; e++)
       ;
@@ -99,7 +101,8 @@ static void sort_segments_local(struct array *local, int dim) {
 }
 
 static void sort_segments_shared_aux(struct array *arr, int dim, struct comm *c,
-                                     buffer *bfr) {
+                                     int verbose, buffer *bfr) {
+  parrsb_print(c, verbose, "\t\t\t\tsss_aux_parallel_sort: ...\n");
   switch (dim) {
   case 0:
     parallel_sort(struct point_t, arr, x[0], gs_double, 0, 1, c, bfr);
@@ -113,23 +116,85 @@ static void sort_segments_shared_aux(struct array *arr, int dim, struct comm *c,
   default:
     break;
   }
+  parrsb_print(c, verbose, "\t\t\t\tsss_aux_parallel_sort: done.\n");
 
   // Mark the first point of the segment to have ifSegment = 1 and zero out
   // everything else.
-  struct point_t *pts = (struct point_t *)arr->ptr;
+  struct point_t *const pts = (struct point_t *const)arr->ptr;
   for (uint i = 0; i < arr->n; i++)
     pts[i].ifSegment = 0;
 
+  sint wrk;
   sint rank = (arr->n > 0) ? c->id : c->np;
-  sint wrk[2];
-  comm_allreduce(c, gs_int, gs_min, &rank, 1, wrk);
+  comm_allreduce(c, gs_int, gs_min, &rank, 1, &wrk);
 
-  if (c->id == rank)
+  if ((sint)c->id == rank)
     pts[0].ifSegment = 1;
+
+  parrsb_print(c, verbose, "\t\t\t\tsss_aux_mark_first_point: done.");
+}
+
+static uint find_bin_scan(const sint sum, const struct comm *c,
+                          const int verbose, buffer *bfr) {
+  sint out[2][1], wrk[2][1], in = sum;
+  comm_scan(out, c, gs_int, gs_add, &in, 1, wrk);
+  return out[0][0];
+}
+
+static uint find_bin_gs(const slong id, const struct comm *c, const int verbose,
+                        buffer *bfr) {
+  slong gid = id + 1;
+  struct gs_data *gsh = gs_setup(&gid, 1, c, 0, gs_crystal_router, verbose);
+  parrsb_print(c, verbose, "\t\t\tsss_gs_setup: done.");
+  sint bin = c->id;
+  gs(&bin, gs_int, gs_min, 0, gsh, bfr);
+  gs_free(gsh);
+
+  return bin;
+}
+
+static uint find_bin_cr(const slong id, const struct comm *c, const int verbose,
+                        buffer *bfr) {
+  struct gid_t {
+    ulong id;
+    uint proc, procm;
+  };
+
+  struct array arr;
+  array_init(struct gid_t, &arr, 1);
+
+  struct gid_t gid = {.id = id, .proc = id % c->np, .procm = c->id};
+  array_cat(struct gid_t, &arr, &gid, 1);
+
+  struct crystal cr;
+  crystal_init(&cr, c);
+
+  sarray_transfer(struct gid_t, &arr, proc, 1, &cr);
+  if (arr.n > 0) {
+    sarray_sort_2(struct gid_t, arr.ptr, arr.n, id, 1, procm, 0, bfr);
+    struct gid_t *pa = (struct gid_t *)arr.ptr;
+    uint s = 0;
+    while (s < arr.n) {
+      uint e = s + 1;
+      for (; e < arr.n && pa[s].id == pa[e].id; e++)
+        pa[e].procm = pa[s].procm;
+      s = e;
+    }
+  }
+  sarray_transfer(struct gid_t, &arr, proc, 0, &cr);
+
+  crystal_free(&cr);
+
+  assert(arr.n == 1);
+  struct gid_t *pa = (struct gid_t *)arr.ptr;
+  uint procm = pa[0].procm;
+  array_free(&arr);
+
+  return procm;
 }
 
 static void sort_segments_shared(struct array *shared, int dim, struct comm *c,
-                                 buffer *bfr) {
+                                 int verbose, buffer *bfr) {
   // Each process can only have at most a single ifSegment = 1 in shared
   // array. Otherwise, we can always move the segments into the local segments
   // array till we end up in such a configuration. Let's first check for this
@@ -157,13 +222,24 @@ static void sort_segments_shared(struct array *shared, int dim, struct comm *c,
     }
   }
   assert(sum <= 1);
-  assert(ngids <= 1 || (ngids == 2 && gids[0] + 1 == gids[1]));
+  assert(ngids <= 1 || (ngids == 2 && gids[1] == gids[0] + 1));
+  parrsb_print(c, verbose, "\t\t\tsss_local: done.");
+
+  // Algorithm used to find the bin id for the segmented shared sort, selected
+  // via the PARRSB_FIND_BIN_ALGO environment variable: 0 (default) is a scan,
+  // 1 is gs with gs_crystal_router, 2 is a custom crystal router exchange.
+  int algo = 0;
+  char *val = getenv("PARRSB_FIND_BIN_ALGO");
+  if (val)
+    algo = atoi(val);
+  assert(algo >= 0 && algo <= 2);
 
   // We sort the shared segments in two phases. All the segments having an even
   // global id are sorted first and then the segments having an odd global id
   // are sorted. This is done to avoid same process having to work on both the
   // global ids (if ngids = 2) it owns at the same time.
   for (int parity = 0; parity < 2; parity++) {
+    parrsb_print(c, verbose, "\t\t\tsss_parity_%d: ...", parity);
     int index = INT_MIN;
     if (gids[0] >= 0 && (gids[0] % 2 == parity))
       index = 0;
@@ -173,21 +249,30 @@ static void sort_segments_shared(struct array *shared, int dim, struct comm *c,
     struct comm active, seg;
     comm_split(c, index >= 0, c->id, &active);
     if (index >= 0) {
-      // Setup a gs handle to find the minimum rank with the current global id
-      // and use that rank as the bin for the comm_split.
-      slong id = gids[index] + 1;
-      struct gs_data *gsh = gs_setup(&id, 1, &active, 0, gs_pairwise, 0);
-      sint bin = active.id;
-      gs(&bin, gs_int, gs_min, 0, gsh, bfr);
-      gs_free(gsh);
+      assert(gids[index] >= 0);
+      sint bin = -1;
+      if (algo == 0) {
+        uint off = (ngids == 1 && sum == 1) || (ngids == 2 && index == 1);
+        bin = find_bin_scan(sum, &active, verbose - 1, bfr) + off;
+      } else if (algo == 1) {
+        bin = find_bin_gs(gids[index], &active, verbose - 1, bfr);
+      } else if (algo == 2) {
+        bin = find_bin_cr(gids[index], &active, verbose - 1, bfr);
+      }
+      parrsb_print(&active, verbose,
+                   "\t\t\tsss_find_bin_algo_%d_parity_%d: done.", algo, parity);
+      assert(bin >= 0 && bin <= (sint)active.np);
 
       // index >= 0 --> gids[index] >= 0 --> segments[index].n > 0
       comm_split(&active, bin, active.id, &seg);
-      sort_segments_shared_aux(&segments[index], dim, &seg, bfr);
+      sort_segments_shared_aux(&segments[index], dim, &seg, verbose - 1, bfr);
       comm_free(&seg);
+      parrsb_print(&active, verbose, "\t\t\tsss_aux_%d: done.", parity);
     }
     comm_free(&active);
+    parrsb_print(c, verbose, "\t\t\tsss_parity_%d: done.", parity);
   }
+  parrsb_print(c, verbose, "\t\t\tsss_shared: done.");
 
   // Combine the segments after sorting.
   shared->n = 0;
@@ -216,7 +301,7 @@ static int talk_to_neighbor(struct point_t *pnt, const struct array *arr,
 
   struct point_t *pts = (struct point_t *)arr->ptr;
   sint dest = (sint)c->id + dir;
-  if (dest >= 0 && dest < c->np) {
+  if (dest >= 0 && dest < (sint)c->np) {
     struct point_t p = (dir == 1) ? pts[arr->n - 1] : pts[0];
     p.proc = dest;
     array_cat(struct point_t, &tmp, &p, 1);
@@ -294,8 +379,8 @@ static void separate_local_segments(struct array *local, struct array *shared,
     s = e;
   }
 
-  sint check = lcheck, wrk[2];
-  comm_allreduce(c, gs_int, gs_add, &check, 1, wrk);
+  sint check = lcheck, wrk;
+  comm_allreduce(c, gs_int, gs_add, &check, 1, &wrk);
   if (check) {
     // Bring the first point from next process. Check if `ifSegment` value
     // of that point is a 1 or a 0. If it is a 1, add the current range to
@@ -357,9 +442,9 @@ static slong number_segments(struct array *local, struct array *shared,
   return st + lt;
 }
 
-static int number_points(struct array *elems, const struct array *local,
-                         const struct array *shared, const struct comm *c,
-                         buffer *bfr) {
+static void number_points(struct array *elems, const struct array *local,
+                          const struct array *shared, const struct comm *c,
+                          buffer *bfr) {
   // First number local points and then number shared points.
   slong out[2][1], wrk[2][1], in = local->n;
   comm_scan(out, c, gs_long, gs_add, &in, 1, wrk);
@@ -401,8 +486,8 @@ int find_unique_vertices(Mesh mesh, struct comm *c, scalar tol, int verbose,
   for (uint i = 0; i < elems->n; i++)
     pts[i].ifSegment = pts[i].globalId = 0;
 
-  slong npts = elems->n, wrk[2];
-  comm_allreduce(c, gs_long, gs_add, &npts, 1, wrk);
+  slong npts = elems->n, wrk;
+  comm_allreduce(c, gs_long, gs_add, &npts, 1, &wrk);
 
   // Initialize shared and local arrays and then copy all points in `elems`
   // array to shared array first. Shared array contains only the segments which
@@ -417,29 +502,33 @@ int find_unique_vertices(Mesh mesh, struct comm *c, scalar tol, int verbose,
 
   for (int t = 0; t < ndim; t++) {
     for (int d = 0; d < ndim; d++) {
-      debug_print(c, verbose, "\t\tlocglob: %d %d", t + 1, d + 1);
-
       // Sort both local and shared segments.
-      sort_segments_shared(&shared, d, c, bfr);
+      parrsb_print(c, verbose - 1, "\t\tsort_shared_segments ...");
+      sort_segments_shared(&shared, d, c, verbose - 1, bfr);
+      parrsb_print(c, verbose - 1, "\t\tsort_local_segments ...");
       sort_segments_local(&local, d);
 
       // Find segments in local and shared segments now.
+      parrsb_print(c, verbose - 1, "\t\tfind_shared_segments ...");
       find_segments(&shared, d, tol2, c);
+      parrsb_print(c, verbose - 1, "\t\tfind_local_segments ...");
       find_segments(&local, d, tol2, &COMM_NULL);
 
       // Separate local segments from the shared segments.
+      parrsb_print(c, verbose - 1, "\t\tseparate_local_segments ...");
       separate_local_segments(&local, &shared, c);
 
       // Number the segments.
+      parrsb_print(c, verbose - 1, "\t\tnumber_segments ...");
       slong nseg = number_segments(&local, &shared, c);
-      debug_print(c, verbose, " %lld %lld\n", nseg, npts);
+      parrsb_print(c, verbose, "\tlocglob: %d %d %lld %lld", t + 1, d + 1, nseg,
+                   npts);
     }
   }
   // Number points consecutively -- shared points after local and then load
   // balance.
-  debug_print(c, verbose, "\t\tnumber points and load balance ...");
+  parrsb_print(c, verbose - 1, "\tnumber_points_and_load_balance ...");
   number_points(elems, &local, &shared, c, bfr);
-  debug_print(c, verbose, "done.\n");
   array_free(&shared), array_free(&local);
 
   return 0;
diff --git a/src/con.c b/src/con.c
index 60e502c9..c34a918b 100644
--- a/src/con.c
+++ b/src/con.c
@@ -1,7 +1,4 @@
 #include "con-impl.h"
-#include "parrsb-impl.h"
-#include "sort.h"
-#include <stdarg.h>
 
 int PRE_TO_SYM_VERTEX[GC_MAX_VERTICES] = {0, 1, 3, 2, 4, 5, 7, 6};
 int PRE_TO_SYM_FACE[GC_MAX_FACES] = {2, 1, 3, 0, 4, 5};
@@ -9,24 +6,13 @@ int NEIGHBOR_MAP[GC_MAX_VERTICES][GC_MAX_NEIGHBORS] = {
     {1, 2, 4}, {0, 3, 5}, {0, 3, 6}, {1, 2, 7},
     {0, 5, 6}, {1, 4, 7}, {2, 4, 7}, {3, 5, 6}};
 
-void debug_print(struct comm *c, int verbose, const char *fmt, ...) {
-  comm_barrier(c);
-  va_list vargs;
-  va_start(vargs, fmt);
-  if (c->id == 0 && verbose > 0) {
-    vprintf(fmt, vargs);
-    fflush(stdout);
-  }
-  va_end(vargs);
-}
-
 double diff_sqr(double x, double y) { return (x - y) * (x - y); }
 
 //==============================================================================
 // Mesh struct
 //
-static struct mesh_t *mesh_init(int nelt, int ndim, double *coord,
-                                long long *pinfo, int npinfo,
+static struct mesh_t *mesh_init(uint nelt, unsigned ndim, double *coord,
+                                long long *pinfo, uint npinfo,
                                 const struct comm *c) {
   struct mesh_t *m = tcalloc(struct mesh_t, 1);
   m->nelt = nelt, m->ndim = ndim, m->nnbrs = ndim;
@@ -37,7 +23,7 @@ static struct mesh_t *mesh_init(int nelt, int ndim, double *coord,
   ulong start = out[0][0];
   m->nelgt = out[1][0];
 
-  int nv = m->nv;
+  uint nv = m->nv;
   array_init(struct point_t, &m->elements, nelt * nv);
   struct point_t p = {.origin = c->id};
   for (uint i = 0; i < nelt; i++) {
@@ -79,39 +65,39 @@ static inline double distance_3d(struct point_t *a, struct point_t *b) {
   return distance_2d(a, b) + diff_sqr(a->x[2], b->x[2]);
 }
 
-int findMinNeighborDistance(Mesh mesh) {
+int find_min_neighbor_distance(Mesh mesh) {
   struct point_t *p = (struct point_t *)mesh->elements.ptr;
-  int ndim = mesh->ndim;
-  int nv = mesh->nv;
+  uint ndim = mesh->ndim;
+  uint nv = mesh->nv;
 
-  uint i, j, k;
-  int neighbor;
-  scalar d;
+  if (ndim < 2 || ndim > 3)
+    return 1;
 
+  uint i, j, k, neighbor;
   if (ndim == 3) {
     for (i = 0; i < mesh->elements.n; i += nv) {
       for (j = 0; j < nv; j++) {
         p[i + j].dx = SCALAR_MAX;
         for (k = 0; k < mesh->nnbrs; k++) {
           neighbor = NEIGHBOR_MAP[j][k];
-          d = distance_3d(&p[i + j], &p[i + neighbor]);
+          scalar d = distance_3d(&p[i + j], &p[i + neighbor]);
           p[i + j].dx = MIN(p[i + j].dx, d);
         }
       }
     }
-  } else if (ndim == 2) {
+  }
+
+  if (ndim == 2) {
     for (i = 0; i < mesh->elements.n; i += nv) {
       for (j = 0; j < nv; j++) {
         p[i + j].dx = SCALAR_MAX;
         for (k = 0; k < mesh->nnbrs; k++) {
           neighbor = NEIGHBOR_MAP[j][k];
-          d = distance_2d(&p[i + j], &p[i + neighbor]);
+          scalar d = distance_2d(&p[i + j], &p[i + neighbor]);
           p[i + j].dx = MIN(p[i + j].dx, d);
         }
       }
     }
-  } else {
-    return 1;
   }
 
   return 0;
@@ -120,7 +106,7 @@ int findMinNeighborDistance(Mesh mesh) {
 //==============================================================================
 // Global numbering
 //
-static int setGlobalID(Mesh mesh, struct comm *c) {
+static int set_global_id(Mesh mesh, struct comm *c) {
   uint nPoints = mesh->elements.n;
   Point points = (struct point_t *)mesh->elements.ptr;
 
@@ -128,9 +114,6 @@ static int setGlobalID(Mesh mesh, struct comm *c) {
   struct comm nonZeroRanks;
   comm_split(c, bin, c->id, &nonZeroRanks);
 
-  sint rank = nonZeroRanks.id;
-  sint size = nonZeroRanks.np;
-
   if (bin == 1) {
     slong count = 0;
     for (uint i = 0; i < nPoints; i++)
@@ -167,7 +150,7 @@ int send_back(Mesh mesh, struct comm *c, buffer *bfr) {
   return 0;
 }
 
-static int transferBoundaryFaces(Mesh mesh, struct comm *c) {
+static int transfer_boundary_faces(Mesh mesh, struct comm *c) {
   uint size = c->np;
 
   struct array *boundary = &mesh->boundary;
@@ -200,24 +183,13 @@ static int transferBoundaryFaces(Mesh mesh, struct comm *c) {
 //==============================================================================
 // C interface to find_conn
 //
-#define check_error(call, msg)                                                 \
-  {                                                                            \
-    sint err = (call);                                                         \
-    sint buf;                                                                  \
-    comm_allreduce(&c, gs_int, gs_max, &err, 1, &buf);                         \
-    if (err) {                                                                 \
-      buffer_free(&bfr), mesh_free(mesh), comm_free(&c);                       \
-      return err;                                                              \
-    }                                                                          \
-  }
-
 // Input:
 //   nelt: Number of elements, nv: Number of vertices in an element
 //   coord [nelt, nv, ndim]: Coordinates of elements vertices in preprocessor
 //     ordering, nv = 8 if ndim == 3 (Hex) or nv = 4 if ndim = 2 (Quad).
 // Output:
 //   vtx[nelt, nv]: Global numbering of vertices of elements
-int parrsb_conn_mesh(long long *vtx, double *coord, int nelt, int ndim,
+int parrsb_conn_mesh(long long *vtx, double *coord, uint nelt, unsigned ndim,
                      long long *pinfo, int npinfo, double tol, MPI_Comm comm) {
   struct comm c;
   comm_init(&c, comm);
@@ -225,73 +197,77 @@ int parrsb_conn_mesh(long long *vtx, double *coord, int nelt, int ndim,
   buffer bfr;
   buffer_init(&bfr, 1024);
 
-  int verbose = 0;
+  int verbose = 1;
   {
     const char *val = getenv("PARRSB_VERBOSE_LEVEL");
     if (val != NULL)
       verbose = atoi(val);
   }
 
-  debug_print(&c, verbose, "Running parCon ...\n");
+  parrsb_print(&c, verbose, "Running parCon ...");
 
   parrsb_barrier(&c);
   double tall = comm_time(), t;
 
   double duration[8] = {0};
-  const char *name[8] = {"transferBoundaryFaces", "findMinNbrDistance   ",
-                         "find_unique_vertices ", "setGlobalId          ",
-                         "elementCheck         ", "faceCheck            ",
-                         "matchPeriodicFaces   ", "copyOutput           "};
+  const char *name[8] = {
+      "transfer_boundary_faces    ", "find_min_neighbor_distance ",
+      "find_unique_vertices       ", "set_global_id              ",
+      "element_check              ", "face_check                 ",
+      "match_periodic_faces       ", "copy_output                "};
 
-  // debug_print(&c, verbose, "\t%s ...");
-  // parrsb_barrier(&c), t = comm_time();
   Mesh mesh = mesh_init(nelt, ndim, coord, pinfo, npinfo, &c);
-  // duration[0] = comm_time() - t;
-  // debug_print(&c, verbose, "done.\n");
 
-  debug_print(&c, verbose, "\t%s ...", name[0]);
+  parrsb_print(&c, verbose - 1, "\t%s ...", name[0]);
   parrsb_barrier(&c), t = comm_time();
-  check_error(transferBoundaryFaces(mesh, &c), name[0]);
+  transfer_boundary_faces(mesh, &c);
   duration[0] = comm_time() - t;
-  debug_print(&c, verbose, "done.\n");
 
-  debug_print(&c, verbose, "\t%s ...", name[1]);
+  parrsb_print(&c, verbose - 1, "\t%s ...", name[1]);
   parrsb_barrier(&c), t = comm_time();
-  check_error(findMinNeighborDistance(mesh), name[1]);
+  find_min_neighbor_distance(mesh);
   duration[1] = comm_time() - t;
-  debug_print(&c, verbose, "done.\n");
 
-  debug_print(&c, verbose, "\t%s ...\n", name[2]);
+  parrsb_print(&c, verbose - 1, "\t%s ...", name[2]);
   parrsb_barrier(&c), t = comm_time();
-  check_error(find_unique_vertices(mesh, &c, tol, verbose, &bfr), name[2]);
+  find_unique_vertices(mesh, &c, tol, verbose - 1, &bfr);
   duration[2] = comm_time() - t;
 
-  debug_print(&c, verbose, "\t%s ...", name[3]);
+  parrsb_print(&c, verbose - 1, "\t%s ...", name[3]);
   parrsb_barrier(&c), t = comm_time();
-  setGlobalID(mesh, &c);
+  set_global_id(mesh, &c);
   send_back(mesh, &c, &bfr);
   duration[3] = comm_time() - t;
-  debug_print(&c, verbose, "done.\n");
 
-  debug_print(&c, verbose, "\t%s ...", name[4]);
+#define check_error(call, msg)                                                 \
+  {                                                                            \
+    sint err = (call), wrk;                                                    \
+    comm_allreduce(&c, gs_int, gs_max, &err, 1, &wrk);                         \
+    if (err) {                                                                 \
+      parrsb_print(&c, 1, msg, __FILE__, __LINE__);                            \
+      buffer_free(&bfr), mesh_free(mesh), comm_free(&c);                       \
+      return err;                                                              \
+    }                                                                          \
+  }
+
+  parrsb_print(&c, verbose - 1, "\t%s ...", name[4]);
   parrsb_barrier(&c), t = comm_time();
-  check_error(elementCheck(mesh, &c, &bfr), name[4]);
+  check_error(element_check(mesh, &c, &bfr), "\t%s:%d element_check failed.");
   duration[4] = comm_time() - t;
-  debug_print(&c, verbose, "done.\n");
 
-  debug_print(&c, verbose, "\t%s ...", name[5]);
+  parrsb_print(&c, verbose - 1, "\t%s ...", name[5]);
   parrsb_barrier(&c), t = comm_time();
-  check_error(faceCheck(mesh, &c, &bfr), name[5]);
+  check_error(face_check(mesh, &c, &bfr), "\t%s:%d face_check failed.");
   duration[5] = comm_time() - t;
-  debug_print(&c, verbose, "done.\n");
 
-  debug_print(&c, verbose, "\t%s ...", name[6]);
+#undef check_error
+
+  parrsb_print(&c, verbose - 1, "\t%s ...", name[6]);
   parrsb_barrier(&c), t = comm_time();
-  check_error(matchPeriodicFaces(mesh, &c, &bfr), name[6]);
+  match_periodic_faces(mesh, &c, verbose - 1, &bfr);
   duration[6] = comm_time() - t;
-  debug_print(&c, verbose, "done.\n");
 
-  debug_print(&c, verbose, "\t%s ...", name[7]);
+  parrsb_print(&c, verbose - 1, "\t%s ...", name[7]);
   parrsb_barrier(&c), t = comm_time();
   Point ptr = mesh->elements.ptr;
   for (uint i = 0; i < nelt; i++) {
@@ -299,32 +275,29 @@ int parrsb_conn_mesh(long long *vtx, double *coord, int nelt, int ndim,
       vtx[i * mesh->nv + j] = ptr[i * mesh->nv + j].globalId + 1;
   }
   duration[7] = comm_time() - t;
-  debug_print(&c, verbose, "done.\n");
 
   // Report timing info and finish
-  double gmin[8], gmax[8], buf[8];
-  for (unsigned i = 0; i < 8; i++)
-    gmax[i] = gmin[i] = duration[i];
-  comm_allreduce(&c, gs_double, gs_min, gmin, 8, buf);
-  comm_allreduce(&c, gs_double, gs_max, gmax, 8, buf);
-
-  if (c.id == 0 && verbose > 1) {
-    for (unsigned i = 0; i < 7; i++)
-      printf("%s: %e %e (min max)\n", name[i], gmin[i], gmax[i]);
-    fflush(stdout);
+  {
+    double gmin[8], gmax[8], buf[8];
+    for (unsigned i = 0; i < 8; i++)
+      gmax[i] = gmin[i] = duration[i];
+    comm_allreduce(&c, gs_double, gs_min, gmin, 8, buf);
+    comm_allreduce(&c, gs_double, gs_max, gmax, 8, buf);
+
+    for (unsigned i = 0; i < 7; i++) {
+      parrsb_print(&c, verbose - 1, "%s: %e %e (min max)", name[i], gmin[i],
+                   gmax[i]);
+    }
   }
 
-  parrsb_barrier(&c), tall = comm_time() - tall;
-  if (c.id == 0) {
-    printf("parCon (tol = %e) finished in %g s\n", tol, tall);
-    fflush(stdout);
-  }
+  parrsb_barrier(&c);
+  tall = comm_time() - tall;
+  parrsb_print(&c, verbose, "parCon (tol = %e) finished in %g s", tol, tall);
 
   buffer_free(&bfr), mesh_free(mesh), comm_free(&c);
 
   return 0;
 }
-#undef check_error
 
 //=============================================================================
 // Fortran interface
diff --git a/src/fiedler.c b/src/fiedler.c
index 17b6425c..f98edbe7 100644
--- a/src/fiedler.c
+++ b/src/fiedler.c
@@ -3,6 +3,9 @@
 #include "parrsb-impl.h"
 #include "sort.h"
 
+#include <math.h>
+#include <time.h>
+
 #define MM 500
 
 extern void matrix_inverse(int N, double *A);
@@ -39,25 +42,24 @@ int power_serial(double *y, uint N, double *A, int verbose) {
   time_t t;
   srand((unsigned)time(&t));
 
-  int i;
   scalar norm = 0.0;
-  for (i = 0; i < N; i++) {
+  for (uint i = 0; i < N; i++) {
     y[i] = (rand() % 50) / 50.0;
     norm += y[i] * y[i];
   }
 
   scalar normi = 1.0 / sqrt(norm);
-  for (i = 0; i < N; i++)
+  for (uint i = 0; i < N; i++)
     y[i] *= normi;
 
   double *Ay = tcalloc(double, N);
-  int j, k, l;
   scalar err = 1.0, lambda;
+  unsigned i;
   for (i = 0; i < 100; i++) {
     norm = 0.0;
-    for (j = 0; j < N; j++) {
+    for (uint j = 0; j < N; j++) {
       Ay[j] = 0.0;
-      for (k = 0; k < N; k++) {
+      for (uint k = 0; k < N; k++) {
         Ay[j] += A[j * N + k] * y[k];
       }
       norm += Ay[j] * Ay[j];
@@ -68,10 +70,10 @@ int power_serial(double *y, uint N, double *A, int verbose) {
     lambda = sqrt(norm);
 
     normi = 1.0 / sqrt(norm);
-    for (j = 0; j < N; j++)
+    for (uint j = 0; j < N; j++)
       y[j] = Ay[j] * normi;
 
-    if (fabs(err) < 1.e-12)
+    if (fabs(err) < 1e-12)
       break;
   }
   free(Ay);
@@ -81,16 +83,16 @@ int power_serial(double *y, uint N, double *A, int verbose) {
 
 int inv_power_serial(double *y, uint N, double *A, int verbose) {
   double *Ainv = tcalloc(double, N *N);
-  int j, k;
-  for (j = 0; j < N; j++) {
-    for (k = 0; k < N; k++)
+  for (uint j = 0; j < N; j++) {
+    for (uint k = 0; k < N; k++)
       Ainv[j * N + k] = A[k * N + j];
   }
 
   matrix_inverse(N, Ainv);
 
+  uint j;
   for (j = 0; j < N; j++) {
-    for (k = 0; k < N; k++)
+    for (uint k = 0; k < N; k++)
       A[j * N + k] = Ainv[k * N + j];
   }
   j = power_serial(y, N, Ainv, verbose);
@@ -101,7 +103,7 @@ int inv_power_serial(double *y, uint N, double *A, int verbose) {
 }
 
 static int project(scalar *x, uint n, scalar *b, struct laplacian *L,
-                   struct mg *d, struct comm *c, int miter, double tol,
+                   struct mg *d, struct comm *c, unsigned miter, double tol,
                    int null_space, int verbose, buffer *bfr) {
   slong out[2][1], buf[2][1], in = n;
   comm_scan(out, c, gs_long, gs_add, &in, 1, buf);
@@ -211,10 +213,9 @@ static int project(scalar *x, uint n, scalar *b, struct laplacian *L,
 
 // Input z should be orthogonal to 1-vector, have unit norm.
 // inverse iteration should not change z.
-static int inverse(scalar *y, struct array *elements, int nv, scalar *z,
-                   struct comm *gsc, int miter, int mpass, double tol,
-                   int factor, int sagg, int grammian, slong nelg,
-                   buffer *buf) {
+static int inverse(scalar *y, struct array *elements, unsigned nv, scalar *z,
+                   struct comm *gsc, unsigned miter, unsigned mpass, double tol,
+                   int factor, int grammian, slong nelg, buffer *buf) {
   metric_tic(gsc, RSB_INVERSE_SETUP);
   uint lelt = elements->n;
   struct rsb_element *elems = (struct rsb_element *)elements->ptr;
@@ -241,7 +242,7 @@ static int inverse(scalar *y, struct array *elements, int nv, scalar *z,
   struct crystal cr;
   crystal_init(&cr, gsc);
   struct par_mat *L = par_csr_setup_con(lelt, eid, vtx, nv, 1, gsc, &cr, buf);
-  struct mg *d = mg_setup(L, factor, sagg, &cr, buf);
+  struct mg *d = mg_setup(L, factor, &cr, buf);
   crystal_free(&cr);
   metric_toc(gsc, RSB_INVERSE_SETUP);
 
@@ -275,7 +276,7 @@ static int inverse(scalar *y, struct array *elements, int nv, scalar *z,
 
     ortho(z, lelt, nelg, gsc);
 
-    int N = i + 1;
+    uint N = i + 1;
     if (grammian == 1) {
       // if k>1;
       //  Z(:,k)=z-Z(:,1:k-1)*(Z(:,1:k-1)'*z);
@@ -378,12 +379,12 @@ static int tqli(scalar *eVectors, scalar *eValues, sint n, scalar *diagonal,
   e[n - 1] = 0.0;
 
   for (i = 0; i < n; i++) {
-    for (uint j = 0; j < n; j++)
+    for (sint j = 0; j < n; j++)
       eVectors[i * n + j] = 0;
     eVectors[i * n + i] = 1;
   }
 
-  int j, k, l, iter, m;
+  sint j, k, l, iter, m;
   for (l = 0; l < n; l++) {
     iter = 0;
     do {
@@ -463,12 +464,12 @@ static int tqli(scalar *eVectors, scalar *eValues, sint n, scalar *diagonal,
 
   for (k = 0; k < n; k++) {
     e[k] = 0;
-    for (uint i = 0; i < n; i++)
+    for (sint i = 0; i < n; i++)
       e[k] += eVectors[k * n + i] * eVectors[k * n + i];
     if (e[k] > 0.0)
       e[k] = sqrt(fabs(e[k]));
     scalar scale = 1.0 / e[k];
-    for (uint i = 0; i < n; i++)
+    for (sint i = 0; i < n; i++)
       eVectors[k * n + i] *= scale;
   }
 
@@ -566,9 +567,9 @@ static int lanczos_aux(scalar *diag, scalar *upper, scalar *rr, uint lelt,
   return iter;
 }
 
-static int lanczos(scalar *fiedler, struct array *elements, int nv,
-                   scalar *initv, struct comm *gsc, int miter, int mpass,
-                   double tol, slong nelg, buffer *bfr) {
+static int lanczos(scalar *fiedler, struct array *elements, unsigned nv,
+                   scalar *initv, struct comm *gsc, unsigned miter,
+                   unsigned mpass, double tol, slong nelg, buffer *bfr) {
   metric_tic(gsc, RSB_LANCZOS_SETUP);
   uint lelt = elements->n;
   struct rsb_element *elems = (struct rsb_element *)elements->ptr;
@@ -582,7 +583,7 @@ static int lanczos(scalar *fiedler, struct array *elements, int nv,
   scalar *rr = tcalloc(scalar, (miter + 1) * lelt);
   scalar *eVectors = tcalloc(scalar, miter * miter);
   scalar *eValues = tcalloc(scalar, miter);
-  int iter = miter, ipass;
+  uint iter = miter, ipass;
   for (ipass = 0; iter == miter && ipass < mpass; ipass++) {
     double t = comm_time();
     iter = lanczos_aux(alpha, beta, rr, lelt, nelg, miter, tol, initv, wl, gsc,
@@ -618,8 +619,12 @@ static int lanczos(scalar *fiedler, struct array *elements, int nv,
   return (ipass - 1) * miter + iter;
 }
 
-int fiedler(struct array *elements, int nv, parrsb_options *opts,
+int fiedler(struct array *elements, int nv, const parrsb_options *const opts,
             struct comm *gsc, buffer *buf, int verbose) {
+  // Return if the number of processes is equal to 1.
+  if (gsc->np == 1)
+    return 0;
+
   metric_tic(gsc, RSB_FIEDLER_SETUP);
   uint lelt = elements->n;
   slong out[2][1], wrk[2][1], in = lelt;
@@ -653,7 +658,7 @@ int fiedler(struct array *elements, int nv, parrsb_options *opts,
   case 1:
     iter = inverse(f, elements, nv, initv, gsc, opts->rsb_max_iter,
                    opts->rsb_max_passes, opts->rsb_tol, opts->rsb_mg_factor,
-                   opts->rsb_mg_sagg, opts->rsb_mg_grammian, nelg, buf);
+                   opts->rsb_mg_grammian, nelg, buf);
     break;
   default:
     break;
diff --git a/src/helpers.c b/src/helpers.c
index 123b5f78..46c282f3 100644
--- a/src/helpers.c
+++ b/src/helpers.c
@@ -10,25 +10,21 @@
 void parrsb_print_stack(void) {
   void *bt[50];
   int bt_size = backtrace(bt, 50);
+  if (bt_size == 0) {
+    fprintf(stderr, "backtrace(): Obtained 0 stack frames.\n");
+    return;
+  }
+
   char **symbols = backtrace_symbols(bt, bt_size);
-  printf("backtrace(): obtained %d stack frames.\n", bt_size);
-  for (unsigned i = 0; i < bt_size; i++)
-    printf("%s\n", symbols[i]);
+  fprintf(stderr, "backtrace(): obtained %d stack frames.\n", bt_size);
+  for (unsigned i = 0; i < (unsigned)bt_size; i++)
+    fprintf(stderr, "%s\n", symbols[i]);
   free(symbols);
 }
 #else
-void parrsb_print_stack(){};
+void parrsb_print_stack() {}
 #endif // defined __GLIBC__
 
-double parrsb_get_max_rss() {
-  struct rusage r_usage;
-  getrusage(RUSAGE_SELF, &r_usage);
-#if defined(__APPLE__) && defined(__MACH__)
-  return (double)r_usage.ru_maxrss;
-#else
-  return (double)(r_usage.ru_maxrss * 1024L);
-#endif
-}
 int log2ll(long long n) {
   int k = 0;
   while (n > 1)
@@ -38,7 +34,7 @@ int log2ll(long long n) {
 }
 
 int parrsb_dist_mesh(unsigned int *nelt_, long long **vl_, double **coord_,
-                     int *part, int nv, MPI_Comm comm) {
+                     int *part, unsigned nv, MPI_Comm comm) {
   typedef struct {
     int proc;
     long long vtx[MAXNV];
@@ -60,7 +56,7 @@ int parrsb_dist_mesh(unsigned int *nelt_, long long **vl_, double **coord_,
   }
   assert(elements.n == nelt);
 
-  int ndim = (nv == 8) ? 3 : 2;
+  unsigned ndim = (nv == 8) ? 3 : 2;
   elem_data *ed = elements.ptr;
   double *coord = (coord_ == NULL ? NULL : *coord_);
   if (coord != NULL) {
@@ -126,7 +122,7 @@ int parrsb_setup_mesh(unsigned *nelt, unsigned *nv, long long **vl,
   parrsb_check_error(err, comm);
 
   parrsb_options opt = parrsb_default_options;
-  err = parrsb_part_mesh(part, NULL, *vl, *coord, *nelt, *nv, opt, comm);
+  err = parrsb_part_mesh(part, *vl, *coord, NULL, *nelt, *nv, &opt, comm);
   parrsb_check_error(err, comm);
 
   // Redistribute data based on identified partitions
@@ -143,16 +139,13 @@ void parrsb_get_part_stat(int *nc, int *ns, int *nss, int *nel, long long *vtx,
   struct comm comm;
   comm_init(&comm, ce);
 
-  int np = comm.np;
-  int id = comm.id;
-
+  uint np = comm.np;
   if (np == 1)
     return;
 
-  int Npts = nelt * nv;
-  int i;
+  size_t Npts = nelt * nv;
   slong *data = (slong *)malloc((Npts + 1) * sizeof(slong));
-  for (i = 0; i < Npts; i++)
+  for (size_t i = 0; i < Npts; i++)
     data[i] = vtx[i];
   struct gs_data *gsh = gs_setup(data, Npts, &comm, 0, gs_pairwise, 0);
 
@@ -165,11 +158,11 @@ void parrsb_get_part_stat(int *nc, int *ns, int *nss, int *nel, long long *vtx,
   gs_free(gsh);
   free(data);
 
-  int nelMin, nelMax, nelSum;
-  int ncMin, ncMax, ncSum;
-  int nsMin, nsMax, nsSum;
-  int nssMin, nssMax, nssSum;
-  int b;
+  sint nelMin, nelMax, nelSum;
+  sint ncMin, ncMax, ncSum;
+  sint nsMin, nsMax, nsSum;
+  sint nssMin, nssMax, nssSum;
+  sint b;
 
   ncMax = Nmsg;
   ncMin = Nmsg;
@@ -181,7 +174,7 @@ void parrsb_get_part_stat(int *nc, int *ns, int *nss, int *nel, long long *vtx,
   nsMax = Ncomm[0];
   nsMin = Ncomm[0];
   nsSum = Ncomm[0];
-  for (i = 1; i < Nmsg; ++i) {
+  for (int i = 1; i < Nmsg; ++i) {
     nsMax = Ncomm[i] > Ncomm[i - 1] ? Ncomm[i] : Ncomm[i - 1];
     nsMin = Ncomm[i] < Ncomm[i - 1] ? Ncomm[i] : Ncomm[i - 1];
     nsSum += Ncomm[i];
@@ -265,23 +258,15 @@ parrsb_cmd_line_opts *parrsb_parse_cmd_opts(int argc, char *argv[]) {
 
   in->mesh = NULL, in->tol = 2e-1;
   in->test = 0, in->dump = 0, in->verbose = 0, in->nactive = INT_MAX;
-  in->ilu_type = 0, in->ilu_tol = 1e-1, in->ilu_pivot = 0;
-  in->crs_type = 0, in->crs_tol = 1e-3;
-
-  static struct option long_options[] = {
-      {"mesh", required_argument, 0, 0},
-      {"tol", optional_argument, 0, 1},
-      {"test", optional_argument, 0, 2},
-      {"dump", optional_argument, 0, 3},
-      {"nactive", optional_argument, 0, 4},
-      {"verbose", optional_argument, 0, 5},
-      {"ilu_type", optional_argument, 0, 10},
-      {"ilu_tol", optional_argument, 0, 11},
-      {"ilu_pivot", optional_argument, 0, 12},
-      {"crs_type", optional_argument, 0, 20},
-      {"crs_tol", optional_argument, 0, 21},
-      {"help", optional_argument, 0, 91},
-      {0, 0, 0, 0}};
+
+  static struct option long_options[] = {{"mesh", required_argument, 0, 0},
+                                         {"tol", optional_argument, 0, 10},
+                                         {"test", optional_argument, 0, 20},
+                                         {"dump", optional_argument, 0, 30},
+                                         {"nactive", optional_argument, 0, 40},
+                                         {"verbose", optional_argument, 0, 50},
+                                         {"help", optional_argument, 0, 99},
+                                         {0, 0, 0, 0}};
 
   size_t len;
   for (;;) {
@@ -295,37 +280,22 @@ parrsb_cmd_line_opts *parrsb_parse_cmd_opts(int argc, char *argv[]) {
       in->mesh = tcalloc(char, len + 1);
       strncpy(in->mesh, optarg, len);
       break;
-    case 1:
+    case 10:
       in->tol = atof(optarg);
       break;
-    case 2:
+    case 20:
       in->test = 1;
       break;
-    case 3:
+    case 30:
       in->dump = 1;
       break;
-    case 4:
+    case 40:
       in->nactive = atoi(optarg);
       break;
-    case 5:
+    case 50:
       in->verbose = atoi(optarg);
       break;
-    case 10:
-      in->ilu_type = atoi(optarg);
-      break;
-    case 11:
-      in->ilu_tol = atof(optarg);
-      break;
-    case 12:
-      in->ilu_pivot = atoi(optarg);
-      break;
-    case 20:
-      in->crs_type = atoi(optarg);
-      break;
-    case 21:
-      in->crs_tol = atof(optarg);
-      break;
-    case 91:
+    case 99:
       print_help();
       break;
     default:
@@ -400,7 +370,7 @@ int parrsb_vector_dump(const char *fname, scalar *y, struct rsb_element *elm,
 
   slong out[2][1], in = nelt;
   comm_scan(out, c, gs_long, gs_add, &in, 1, wrk);
-  slong start = out[0][0], nelgt = out[1][0];
+  slong nelgt = out[1][0];
 
   int ndim = (nv == 8) ? 3 : 2;
   uint write_size = ((ndim + 1) * sizeof(double) + sizeof(slong)) * nelt;
diff --git a/src/ilu.c b/src/ilu.c
deleted file mode 100644
index ac26f42b..00000000
--- a/src/ilu.c
+++ /dev/null
@@ -1,1513 +0,0 @@
-#include "ilu.h"
-#include <math.h>
-
-#define CSC 0
-#define CSR 1
-
-//=============================================================================
-// ILU levels
-//
-// Currently there are two methods of finding levels
-//   1. Based on final element distribution among processors (dst_lvls)
-//   2. Based on RSB levels while partitioning (rsb_lvls)
-struct key_t {
-  ulong e;
-  uint p;
-};
-
-struct e2n_t {
-  ulong e, n;
-};
-
-struct request_t {
-  ulong r;
-  uint p, o;
-};
-
-static int find_unique_nbrs(struct array *e2nm, uint n, int nv,
-                            const ulong *ids, const slong *vtx,
-                            struct crystal *cr, buffer *bfr) {
-  struct array nbrs;
-  find_nbrs(&nbrs, ids, vtx, n, nv, cr, bfr);
-
-  array_init(struct e2n_t, e2nm, n * 10);
-  if (nbrs.n > 0) {
-    sarray_sort_2(struct nbr, nbrs.ptr, nbrs.n, r, 1, c, 1, bfr);
-    struct nbr *pn = (struct nbr *)nbrs.ptr;
-
-    struct e2n_t en;
-    uint i, j;
-    for (i = 1, j = 0; i < nbrs.n; i++) {
-      if ((pn[i].r != pn[j].r) || (pn[i].c != pn[j].c)) {
-        en.e = pn[j].r, en.n = pn[j].c;
-        array_cat(struct e2n_t, e2nm, &en, 1);
-        j = i;
-      }
-    }
-    en.e = pn[j].r, en.n = pn[j].c;
-    array_cat(struct e2n_t, e2nm, &en, 1);
-    sarray_sort_2(struct e2n_t, e2nm->ptr, e2nm->n, e, 1, n, 1, bfr);
-  }
-  array_free(&nbrs);
-
-  return 0;
-}
-
-static int local_dof(const ulong *rows, const ulong I, const uint n) {
-  for (uint i = 0; i < n; i++)
-    if (rows[i] == I)
-      return i;
-  return n;
-}
-
-// Fill dofs array with unique dofs found in this processr
-static int update_keys(struct array *keys, struct array *nbrs, const uint ln,
-                       const ulong *lids, struct crystal *cr, buffer *bfr) {
-  uint i, j;
-  struct array temp, rqst;
-  array_init(struct request_t, &temp, nbrs->n);
-  array_init(struct request_t, &rqst, nbrs->n);
-
-  struct comm *c = &cr->comm;
-  struct e2n_t *pn = (struct e2n_t *)nbrs->ptr;
-  struct request_t t;
-  for (i = 0; i < nbrs->n; i++) {
-    t.r = pn[i].n, t.p = t.r % c->np;
-    t.o = (local_dof(lids, t.r, ln) < ln);
-    array_cat(struct request_t, &temp, &t, 1);
-  }
-
-  struct request_t *pt = (struct request_t *)temp.ptr;
-  if (temp.n > 0) {
-    sarray_sort(struct request_t, temp.ptr, temp.n, r, 1, bfr);
-    for (i = 1, j = 0; i < temp.n; i++) {
-      if (pt[i].r != pt[j].r) {
-        array_cat(struct request_t, &rqst, &pt[j], 1);
-        j = i;
-      }
-    }
-    array_cat(struct request_t, &rqst, &pt[j], 1);
-  }
-
-  sarray_transfer(struct request_t, &rqst, p, 1, cr);
-  sarray_sort_2(struct request_t, rqst.ptr, rqst.n, r, 1, o, 0, bfr);
-
-  struct request_t *pr = (struct request_t *)rqst.ptr;
-  if (rqst.n > 0) {
-    for (i = 1, j = 0; i < rqst.n; i++) {
-      if (pr[i].r != pr[j].r) {
-        // owner for dof j, j + 1, ... i - 1 is pr[i - 1].p
-        assert(pr[i - 1].o == 1);
-        for (; j < i; j++)
-          pr[j].o = pr[i - 1].p;
-        // j = i at the end
-      }
-    }
-    assert(pr[i - 1].o == 1);
-    for (; j < i; j++)
-      pr[j].o = pr[i - 1].p;
-  }
-
-  sarray_transfer(struct request_t, &rqst, o, 0, cr);
-  sarray_sort_2(struct request_t, rqst.ptr, rqst.n, r, 1, p, 0, bfr);
-
-  // All the requests are forwarded correctly. Send the data back
-  // to the requesting processors. Note that the requests are unique.
-  struct key_t *pk = (struct key_t *)keys->ptr;
-  pr = (struct request_t *)rqst.ptr;
-  temp.n = 0;
-  for (i = j = 0; i < rqst.n; i++) {
-    while (pk[j].e < pr[i].r)
-      j++;
-    // Sanity check
-    assert(pk[j].e == pr[i].r);
-    t.o = pr[i].p;
-    for (uint k = j; k < keys->n && pk[k].e == pk[j].e; k++) {
-      t.r = pk[k].e, t.p = pk[k].p;
-      array_cat(struct request_t, &temp, &t, 1);
-    }
-  }
-  array_free(&rqst);
-
-  sarray_transfer(struct request_t, &temp, o, 0, cr);
-  sarray_sort_2(struct request_t, temp.ptr, temp.n, r, 1, p, 0, bfr);
-
-  // Update the keys array. Update here is a complete rewrite.
-  struct array keyt;
-  array_init(struct key_t, &keyt, temp.n);
-
-  struct key_t s;
-  pt = (struct request_t *)temp.ptr;
-  for (i = 0; i < ln; i++) {
-    ulong e = lids[i];
-    // Find `e` in the nbrs array
-    for (j = 0; j < nbrs->n && pn[j].e < e; j++)
-      ;
-    assert(j < nbrs->n && pn[j].e == e);
-    // Now go through all the neighbors and update the keys
-    for (; j < nbrs->n && pn[j].e == e; j++) {
-      ulong n = pn[j].n;
-      // find the key of `n` in temp
-      uint k = 0;
-      for (; k < temp.n && pt[k].r < n; k++)
-        ;
-      assert(k < temp.n && pt[k].r == n);
-      for (; k < temp.n && pt[k].r == n; k++) {
-        s.e = e, s.p = pt[k].p;
-        array_cat(struct key_t, &keyt, &s, 1);
-      }
-    }
-  }
-  array_free(&temp);
-
-  keys->n = 0;
-  if (keyt.n > 0) {
-    sarray_sort_2(struct key_t, keyt.ptr, keyt.n, e, 1, p, 0, bfr);
-    pk = (struct key_t *)keyt.ptr;
-    for (i = 1, j = 0; i < keyt.n; i++) {
-      if ((pk[i].e != pk[j].e) || (pk[i].p != pk[j].p)) {
-        array_cat(struct key_t, keys, &pk[j], 1);
-        j = i;
-      }
-    }
-    array_cat(struct key_t, keys, &pk[j], 1);
-  }
-
-  array_free(&keyt);
-
-  return 0;
-}
-
-// This routine will update `lvl_n`, `lvl_off` and `lvl_ids` with the DOF
-// belongig to current level. In the process, it will remove the DOFs and their
-// connectivity from ids, and vtx arrays. `n` will be adjusted to reflect
-// changes.
-static int dst_lvls_aux(int *lvl_n, uint *lvl_off, uint *lvl_owner,
-                        ulong *lvl_ids, uint *n, ulong *ids, slong *vtx, int nv,
-                        struct array *keys, struct comm *c, int verbose) {
-  // Find the min key size locally.
-  uint i, j, k;
-  sint min = INT_MAX;
-  struct key_t *pk = (struct key_t *)keys->ptr;
-  if (keys->n > 0) {
-    for (i = 1, j = 0; i < keys->n; i++) {
-      if (pk[i].e != pk[j].e) {
-        // Different element, update min key size if required
-        min = (min > i - j ? i - j : min);
-        j = i;
-      }
-    }
-    min = (min > i - j ? i - j : min);
-  }
-
-  sint buf[2];
-  comm_allreduce(c, gs_int, gs_min, &min, 1, buf);
-  if (min == INT_MAX)
-    return 0;
-
-  int lvl = *lvl_n;
-  uint off = lvl_off[lvl];
-  if (keys->n > 0) {
-    for (i = 1, j = 0; i < keys->n; i++) {
-      if (pk[i].e != pk[j].e) {
-        if (i - j == min)
-          lvl_ids[off] = pk[j].e, lvl_owner[off] = pk[i - 1].p, off++;
-        j = i;
-      }
-    }
-    if (i - j == min)
-      lvl_ids[off] = pk[j].e, lvl_owner[off] = pk[i - 1].p, off++;
-  }
-
-  assert(lvl < 50);
-  lvl++, lvl_off[lvl] = off;
-  if (verbose > 1) {
-    printf("id: %d |key| = %d lvl = %d size = %u\n", c->id, min, lvl,
-           lvl_off[lvl] - lvl_off[lvl - 1]);
-    fflush(stdout);
-  }
-
-  // Now we have to update ids and vtx. This can be done in place.
-  for (i = lvl_off[lvl - 1], j = 0, k = 0; i < lvl_off[lvl]; i++, j++) {
-    for (; j < *n && ids[j] < lvl_ids[i]; j++, k++) {
-      ids[k] = ids[j];
-      for (int v = 0; v < nv; v++)
-        vtx[k * nv + v] = vtx[j * nv + v];
-    }
-    assert(j < *n && ids[j] == lvl_ids[i]);
-  }
-  for (; j < *n; j++, k++) {
-    ids[k] = ids[j];
-    for (int v = 0; v < nv; v++)
-      vtx[k * nv + v] = vtx[j * nv + v];
-  }
-
-  *n -= lvl_off[lvl] - lvl_off[lvl - 1], *lvl_n = lvl;
-
-  return 0;
-}
-
-static int dst_lvls(uint *lvl_off, uint *lvl_owner, ulong *lvl_ids,
-                    const uint n_, const int nv, const ulong *ids_,
-                    const slong *vtx_, struct crystal *cr, int verbose,
-                    buffer *bfr) {
-  // Copy ids and vtx since we are going to modify them
-  uint n = n_;
-  ulong *ids = tcalloc(ulong, n);
-  slong *vtx = tcalloc(slong, n * nv);
-  for (uint i = 0, j = 0; i < n; i++) {
-    ids[i] = ids_[i];
-    for (int v = 0; v < nv; v++, j++)
-      vtx[j] = vtx_[j];
-  }
-
-  struct comm *c = &cr->comm;
-
-  // Initialize keys: set key of each dof to the current MPI rank.
-  // keys array should has unique entries and should be sorted first
-  // by .e and then by .p.
-  struct array keys;
-  array_init(struct key_t, &keys, n);
-  struct key_t e2p = {.e = 0, .p = c->id};
-  for (uint i = 0; i < n; i++) {
-    e2p.e = ids[i];
-    array_cat(struct key_t, &keys, &e2p, 1);
-  }
-  sarray_sort_2(struct key_t, keys.ptr, keys.n, e, 1, p, 0, bfr);
-
-  slong ng = n, buf[2];
-  comm_allreduce(c, gs_long, gs_add, &ng, 1, buf);
-
-  int nlvls = 0;
-  struct array nbrs;
-  while (ng > 0) {
-    // Find unique neighbors of a DOF. DOF is a neighbor of itself.
-    find_unique_nbrs(&nbrs, n, nv, ids, vtx, cr, bfr);
-
-    // Send and receive key to/from neighbors. We forward all the requests
-    // for the key of a DOF to the processor that owns the DOF and then that
-    // processor takes care of the request. To do that, we first find all the
-    // unique requests.
-    update_keys(&keys, &nbrs, n, ids, cr, bfr);
-
-    // Find the min key size
-    // Add all the dofs with key size equal to min key size to current level
-    // Update ids and vtx by removing the dofs with min key size
-    dst_lvls_aux(&nlvls, lvl_off, lvl_owner, lvl_ids, &n, ids, vtx, nv, &keys,
-                 c, verbose);
-
-    ng = n;
-    comm_allreduce(c, gs_long, gs_add, &ng, 1, buf);
-    if (verbose > 1) {
-      if (c->id == 0)
-        printf("lvl = %d ng = %lld\n", nlvls, ng);
-      fflush(stdout);
-    }
-    array_free(&nbrs);
-  }
-
-  free(ids), free(vtx);
-
-  return nlvls;
-}
-
-static int rsb_lvls(uint *lvl_off, uint *lvl_owner, ulong *lvl_ids,
-                    const uint n, const int nv, const ulong *ids,
-                    const slong *vtx, struct comm *ci, int verbose,
-                    buffer *bfr) {
-  slong ng = n, buf[2];
-  comm_allreduce(ci, gs_long, gs_add, &ng, 1, buf);
-
-  // What we are going to do is identify the elements in the interface at each
-  // level. These elements constitute the level of ILU. Owner of the element is
-  // the processor which at least own a single vertex (possibly duplicated) of
-  // the element.
-
-  uint size = n * nv;
-  sint *in = tcalloc(sint, size);
-  sint *lvl = tcalloc(sint, n);
-  sint *owner = tcalloc(sint, n);
-  if (owner == NULL || lvl == NULL || in == NULL) {
-    fprintf(stderr, "Failed to allocate lvl, owner or in !\n");
-    exit(1);
-  }
-
-  struct comm c, t;
-  comm_dup(&c, ci);
-
-  uint i;
-  sint nlvls = 1, j;
-  while (c.np > 1) {
-    struct gs_data *gsh = gs_setup(vtx, size, &c, 0, gs_pairwise, 0);
-
-    int bin = (c.id >= (c.np + 1) / 2);
-    for (i = 0; i < size; i++)
-      in[i] = bin;
-
-    gs(in, gs_int, gs_max, 0, gsh, bfr);
-
-    if (bin == 1) {
-      for (i = 0; i < size; i++)
-        in[i] = 0;
-    }
-
-    gs(in, gs_int, gs_max, 0, gsh, bfr);
-
-    sint ownr = 0;
-    for (i = 0; i < n; i++) {
-      for (j = 0; j < nv; j++) {
-        if (in[i * nv + j] > 0) {
-          if (lvl[i] == 0) {
-            lvl[i] = nlvls;
-            ownr = ci->id + 1;
-          }
-          break;
-        }
-      }
-    }
-
-    comm_allreduce(&c, gs_int, gs_max, &ownr, 1, buf);
-
-    for (i = 0; i < n; i++) {
-      if (lvl[i] == nlvls)
-        owner[i] = ownr - 1;
-    }
-
-    nlvls++;
-
-    gs_free(gsh);
-    comm_split(&c, bin, c.id, &t), comm_free(&c);
-    comm_dup(&c, &t), comm_free(&t);
-  }
-  comm_free(&c);
-
-  int rem = 0;
-  for (uint i = 0; i < n; i++) {
-    if (lvl[i] == 0) {
-      lvl[i] = nlvls;
-      owner[i] = ci->id;
-      rem = 1;
-    }
-  }
-  nlvls += rem;
-  comm_allreduce(ci, gs_int, gs_max, &nlvls, 1, buf);
-
-  // Reverse the level numbers
-  for (uint i = 0; i < n; i++)
-    lvl[i] = nlvls - lvl[i];
-
-  struct linfo_t {
-    uint lvl, owner;
-    ulong id;
-  };
-
-  struct array linfos;
-  array_init(struct linfo_t, &linfos, n);
-
-  struct linfo_t linfo = {.lvl = 0, .owner = 0, .id = 0};
-  for (uint i = 0; i < n; i++) {
-    linfo.lvl = lvl[i], linfo.owner = owner[i], linfo.id = ids[i];
-    array_cat(struct linfo_t, &linfos, &linfo, 1);
-  }
-  sarray_sort(struct linfo_t, linfos.ptr, linfos.n, lvl, 0, bfr);
-
-  if (linfos.n > 0) {
-    struct linfo_t *pl = (struct linfo_t *)linfos.ptr;
-    for (uint l = 0, i = 0; l < nlvls; l++) {
-      for (; i < linfos.n && pl[i].lvl == l; i++)
-        lvl_ids[i] = pl[i].id, lvl_owner[i] = pl[i].owner;
-      lvl_off[l + 1] = i;
-    }
-  }
-
-  array_free(&linfos);
-  free(owner), free(lvl), free(in);
-
-  return nlvls;
-}
-
-static int find_lvls(uint *lvl_off, uint *lvl_owner, ulong *lvl_ids,
-                     const uint n, const int nv, const ulong *ids,
-                     const slong *vtx, int type, struct crystal *cr,
-                     int verbose, buffer *bfr) {
-  int nlvls = 0;
-  switch (type) {
-  case 0:
-    nlvls = dst_lvls(lvl_off, lvl_owner, lvl_ids, n, nv, ids, vtx, cr, verbose,
-                     bfr);
-    break;
-  case 1:
-    nlvls = rsb_lvls(lvl_off, lvl_owner, lvl_ids, n, nv, ids, vtx, &cr->comm,
-                     verbose, bfr);
-    break;
-  default:
-    break;
-  }
-  return nlvls;
-}
-
-//=============================================================================
-// ILU
-//
-struct ilu {
-  int pivot, verbose;
-  // 1st dropping rule: An entry a_ij is dropped abs(a_ij) < tol
-  scalar tol;
-  // 2nd dropping rule: Entries are dropped so that total nnz per row/col < p
-  uint nnz_per_row;
-
-  // Calculated values internal to ILU
-  uint nlvls, *lvl_off;
-  ulong *perm;
-  struct par_mat A, L, U;
-  struct crystal cr;
-};
-
-//=============================================================================
-// ILU(0)
-//
-static int ilu0_get_rows(struct par_mat *E, int lvl, uint *lvl_off,
-                         struct par_mat *A, struct crystal *cr, buffer *bfr) {
-  struct owner {
-    ulong ri;
-    uint rp, p;
-  };
-
-  assert(IS_CSR(A) && !IS_DIAG(A));
-
-  struct array owners, requests;
-  array_init(struct owner, &owners, A->rn * 30);
-  array_init(struct owner, &requests, A->rn * 30);
-
-  struct comm *c = &cr->comm;
-  struct owner t;
-  for (uint i = lvl_off[lvl - 1]; i < lvl_off[lvl]; i++) {
-    ulong I = A->rows[i];
-    for (uint j = A->adj_off[i];
-         j < A->adj_off[i + 1] && A->cols[A->adj_idx[j]] < I; j++) {
-      t.ri = A->cols[A->adj_idx[j]], t.rp = c->np, t.p = t.ri % c->np;
-      array_cat(struct owner, &owners, &t, 1);
-    }
-  }
-
-  for (uint i = lvl_off[0]; i < lvl_off[lvl]; i++) {
-    t.ri = A->rows[i], t.rp = c->id, t.p = t.ri % c->np;
-    array_cat(struct owner, &owners, &t, 1);
-  }
-
-  sarray_sort_2(struct owner, owners.ptr, owners.n, ri, 1, rp, 0, bfr);
-  struct owner *ptr = (struct owner *)owners.ptr;
-  uint i, j;
-  for (i = 0; i < owners.n; i = j) {
-    for (j = i + 1; j < owners.n && ptr[j].ri == ptr[i].ri; j++)
-      ;
-    array_cat(struct owner, &requests, &ptr[i], 1);
-  }
-  array_free(&owners);
-
-  // Match row ids and set `p` to the original processor
-  sarray_transfer(struct owner, &requests, p, 1, cr);
-
-  // Set rp to the owner
-  sarray_sort_2(struct owner, requests.ptr, requests.n, ri, 1, rp, 0, bfr);
-  ptr = (struct owner *)requests.ptr;
-  for (i = 0; i < requests.n; i = j) {
-    assert(ptr[i].rp < c->np);
-    for (j = i + 1; j < requests.n && ptr[j].ri == ptr[i].ri; j++) {
-      assert(ptr[j].rp == c->np);
-      ptr[j].rp = ptr[i].rp;
-    }
-  }
-
-  // Forward requests to the owner processor
-  sarray_transfer(struct owner, &requests, rp, 0, cr);
-
-  sarray_sort_2(struct owner, requests.ptr, requests.n, ri, 1, p, 0, bfr);
-  ptr = (struct owner *)requests.ptr;
-
-  struct array sends;
-  array_init(struct mij, &sends, A->rn * 30);
-
-  for (i = 0; i < requests.n; i = j) {
-    ulong ri = ptr[i].ri;
-    uint ro = local_dof(A->rows, ri, A->rn);
-    assert(ro < A->rn);
-    for (j = i; j < requests.n && ptr[j].ri == ri; j++) {
-      // No need to send to owner
-      if (ptr[j].p != c->id) {
-        // copy_row(&sends, ro, ptr[j].p, A);
-        struct mij m = {.r = A->rows[ro], .idx = 0, .p = ptr[j].p};
-        for (uint k = A->adj_off[ro], ke = A->adj_off[ro + 1]; k < ke; k++) {
-          m.c = A->cols[A->adj_idx[k]], m.v = A->adj_val[k];
-          array_cat(struct mij, &sends, &m, 1);
-        }
-      }
-    }
-  }
-  array_free(&requests);
-
-  sarray_transfer(struct mij, &sends, p, 1, cr);
-  par_csr_setup(E, &sends, 0, bfr);
-  array_free(&sends);
-
-  return 0;
-}
-
-static void ilu0_update_row(const uint io, const uint k, struct par_mat *A,
-                            struct par_mat *E, int verbose, int lvl) {
-  uint *off = A->adj_off, *idx = A->adj_idx;
-  uint *koff = A->adj_off, *kidx = A->adj_idx;
-  ulong *cols = A->cols, *kcols = A->cols;
-  scalar *val = A->adj_val, *kval = A->adj_val;
-
-  const ulong K = cols[idx[k]];
-  const ulong I = A->rows[io];
-
-  // Find offsets of K in A
-  sint ko = -1;
-  uint j;
-  for (j = 0; j < A->rn; j++) {
-    if (A->rows[j] == K) {
-      ko = j;
-      break;
-    }
-  }
-
-  // Search in E if K is not found in A
-  if (ko == -1 && E != NULL) {
-    koff = E->adj_off, kidx = E->adj_idx;
-    kval = E->adj_val, kcols = E->cols;
-    for (j = 0; j < E->rn; j++) {
-      if (E->rows[j] == K) {
-        ko = j;
-        break;
-      }
-    }
-  }
-
-  // Oops, K is no where to be found
-  if (ko == -1) {
-    fprintf(stderr, "%s:%d lvl = %d, k = %u ko = %d\n", __FILE__, __LINE__, lvl,
-            k, ko);
-    exit(1);
-  }
-
-  // Calculate a_ik = a_ik / a_kk
-  scalar a_kk = 0;
-  for (j = koff[ko]; j < koff[ko + 1]; j++) {
-    if (kcols[kidx[j]] == K) {
-      a_kk = kval[j];
-      break;
-    }
-  }
-
-  if (fabs(a_kk) < 1e-10) {
-    fprintf(stderr, "%s:%d ilu0: Diagonal is zero ! k = %llu\n", __FILE__,
-            __LINE__, K);
-    exit(1);
-  }
-
-  // cols[idx[k]] = K and val[k] = a_ik
-  scalar a_ik = val[k] / a_kk;
-  if (verbose) {
-    printf("a_kk = %lf a_ik = %lf a_ik/a_kk = %lf\n", a_kk, val[j], a_ik);
-    fflush(stdout);
-  }
-  val[k] = a_ik;
-
-  uint kj;
-  scalar a_kj;
-  for (j = k + 1; j < off[io + 1]; j++) {
-    for (kj = koff[ko]; kj < koff[ko + 1] && kcols[kidx[kj]] < cols[idx[j]];
-         kj++)
-      ;
-    if (kj < koff[ko + 1] && kcols[kidx[kj]] == cols[idx[j]])
-      a_kj = kval[kj];
-    else
-      a_kj = 0;
-
-    if (verbose) {
-      printf("a_ij = %lf a_ik = %lf a_kj = %lf\n", val[j], a_ik, a_kj);
-      fflush(stdout);
-    }
-    // a_ij = a_ij - a_ik * a_kj
-    val[j] -= a_ik * a_kj;
-  }
-}
-
-static void ilu0_level(int lvl, uint *lvl_off, struct par_mat *A,
-                       struct par_mat *E, int verbose) {
-  ulong *cols = A->cols, *rows = A->rows;
-  uint *off = A->adj_off, *idx = A->adj_idx, i, k;
-  for (i = lvl_off[lvl - 1] + (lvl == 1); i < lvl_off[lvl]; i++)
-    for (k = off[i]; k < off[i + 1] && cols[idx[k]] < rows[i]; k++)
-      ilu0_update_row(i, k, A, E, verbose, lvl);
-}
-
-static void ilu0(struct ilu *ilu, buffer *bfr) {
-  ilu0_level(1, ilu->lvl_off, &ilu->A, NULL, 0);
-  struct par_mat E;
-  for (int l = 2; l <= ilu->nlvls; l++) {
-    ilu0_get_rows(&E, l, ilu->lvl_off, &ilu->A, &ilu->cr, bfr);
-    ilu0_level(l, ilu->lvl_off, &ilu->A, &E, 0);
-    par_mat_free(&E);
-  }
-}
-
-//=============================================================================
-// ILUC
-//
-struct eij_t {
-  ulong r, c;
-  uint p;
-  scalar v;
-};
-
-// We are going to separate A matrix to L and U where L is the strictly lower
-// triangular part of A and U is the upper triangular part of A (including the
-// diagonal). Since A is in CSR format, extracting U (in CSR format) is easy.
-// L will be distributed by columns and we need to figure out the owner of a
-// given column.
-static void iluc_sep_lu(struct ilu *ilu, buffer *bfr) {
-  // Recover the communicator
-  struct crystal *cr = &ilu->cr;
-  struct comm *c = &cr->comm;
-
-  // Setup U
-  struct par_mat *A = &ilu->A;
-  struct array uijs, lijs;
-  array_init(struct mij, &uijs, A->rn * 30);
-  array_init(struct mij, &lijs, A->rn * 30);
-
-  struct mij m = {.r = 0, .c = 0, .idx = 0, .p = 0, .v = 0};
-  uint i, j, je;
-  for (i = 0; i < A->rn; i++) {
-    m.r = A->rows[i];
-    j = A->adj_off[i], je = A->adj_off[i + 1];
-    for (; j < je && A->cols[A->adj_idx[j]] < m.r; j++) {
-      m.c = A->cols[A->adj_idx[j]], m.v = A->adj_val[j];
-      m.p = m.c % c->np, m.idx = (local_dof(A->rows, m.c, A->rn) < A->rn);
-      array_cat(struct mij, &lijs, &m, 1);
-    }
-    // Add the unit diagonal to L (We actually don't need to send this)
-    m.c = m.r, m.v = 1, m.p = m.c % c->np, m.idx = 1;
-    array_cat(struct mij, &lijs, &m, 1);
-
-    for (; j < je; j++) {
-      m.c = A->cols[A->adj_idx[j]], m.v = A->adj_val[j];
-      array_cat(struct mij, &uijs, &m, 1);
-    }
-  }
-
-  par_mat_setup(&ilu->U, &uijs, CSR, 0, bfr);
-  array_free(&uijs);
-
-  // Setup L
-  sarray_transfer(struct mij, &lijs, p, 1, cr);
-  if (lijs.n > 0) {
-    sarray_sort_2(struct mij, lijs.ptr, lijs.n, c, 1, idx, 0, bfr);
-    struct mij *pl = (struct mij *)lijs.ptr;
-    for (i = 1, j = 0; i < lijs.n; i++) {
-      if (pl[i].c != pl[j].c) {
-        assert(pl[i - 1].idx == 1);
-        for (; j < i; j++)
-          pl[j].p = pl[i - 1].p;
-        // j == i at the end
-      }
-    }
-    // residual
-    assert(pl[i - 1].idx == 1);
-    for (; j < i; j++)
-      pl[j].p = pl[i - 1].p;
-  }
-
-  sarray_transfer(struct mij, &lijs, p, 0, cr);
-  par_mat_setup(&ilu->L, &lijs, CSC, 0, bfr);
-  array_free(&lijs);
-}
-
-static void iluc_fwrd_rqsts(struct array *fwds, struct array *rqsts,
-                            const int type, const ulong K,
-                            const struct array *A, struct crystal *cr,
-                            buffer *bfr) {
-  fwds->n = rqsts->n = 0;
-  struct request_t t = {.r = 0, .p = 0, .o = 1};
-
-  struct comm *c = &cr->comm;
-
-#define INIT_RQST(f, g, arr)                                                   \
-  do {                                                                         \
-    if (A->n > 0) {                                                            \
-      sarray_sort_2(struct mij, A->ptr, A->n, f, 1, g, 1, bfr);                \
-      struct mij *pa = (struct mij *)A->ptr;                                   \
-      uint i = 1, j = 0;                                                       \
-      for (; i < A->n; i++) {                                                  \
-        if (pa[i].f != pa[j].f) {                                              \
-          t.r = pa[j].f, t.p = t.r % c->np;                                    \
-          array_cat(struct request_t, arr, &t, 1);                             \
-          j = i;                                                               \
-        }                                                                      \
-      }                                                                        \
-      if (j < i) {                                                             \
-        t.r = pa[j].f, t.p = t.r % c->np;                                      \
-        array_cat(struct request_t, arr, &t, 1);                               \
-      }                                                                        \
-    }                                                                          \
-  } while (0)
-
-  if (type == CSC)
-    INIT_RQST(r, c, rqsts);
-  else
-    INIT_RQST(c, r, rqsts);
-#undef INIT_RQST
-
-  if (K > 0) {
-    t.r = K, t.p = K % c->np, t.o = 0;
-    array_cat(struct request_t, rqsts, &t, 1);
-  }
-
-  sarray_transfer(struct request_t, rqsts, p, 1, cr);
-
-  // Okay, we got all the requests (if any) and non-zero row/col ids in the same
-  // processor. Now we forward the requests to the original owners.
-  if (rqsts->n > 0) {
-    sarray_sort_2(struct request_t, rqsts->ptr, rqsts->n, r, 1, o, 0, bfr);
-    struct request_t *pr = (struct request_t *)rqsts->ptr;
-    uint s = 0, e = 1;
-    for (; e < rqsts->n; e++) {
-      if (pr[e].r != pr[s].r) {
-        if (pr[s].o == 0) { // This is a request
-          uint p = pr[s].p;
-          for (s = s + 1; s < e; s++) {
-            pr[s].o = p;
-            array_cat(struct request_t, fwds, &pr[s], 1);
-          }
-        }
-        s = e;
-      }
-    }
-    if (s < e && pr[s].o == 0) {
-      uint p = pr[s].p;
-      for (s = s + 1; s < e; s++) {
-        pr[s].o = p;
-        array_cat(struct request_t, fwds, &pr[s], 1);
-      }
-    }
-  }
-
-  sarray_transfer(struct request_t, fwds, p, 0, cr);
-}
-
-static void iluc_send_data(struct array *data, const int type, struct array *A,
-                           struct array *work, struct crystal *cr,
-                           buffer *bfr) {
-  if (type == CSC) {
-    sarray_sort_2(struct mij, A->ptr, A->n, c, 1, r, 1, bfr);
-    sarray_sort_2(struct eij_t, work->ptr, work->n, r, 1, c, 1, bfr);
-  } else {
-    sarray_sort_2(struct mij, A->ptr, A->n, r, 1, c, 1, bfr);
-    sarray_sort_2(struct eij_t, work->ptr, work->n, c, 1, r, 1, bfr);
-  }
-
-  // We only have one request per processor, so sorting by processor is the
-  // same as sorting by row id. But just to be safe we will sort by row id.
-  data->n = 0;
-  if (work->n > 0) {
-    struct eij_t *pw = (struct eij_t *)work->ptr;
-    uint i = 1, j = 0;
-    for (; i < work->n; i++) {
-      if ((pw[i].r != pw[j].r) || (pw[i].c != pw[j].c)) {
-        array_cat(struct eij_t, data, &pw[j], 1);
-        j = i;
-      } else
-        pw[j].v += pw[i].v;
-    }
-    if (j < i)
-      array_cat(struct eij_t, data, &pw[j], 1);
-  }
-
-  sarray_transfer(struct eij_t, data, p, 0, cr);
-}
-
-static void iluc_get_data(struct array *data, ulong K, int type,
-                          struct array *A, struct array *B, struct crystal *cr,
-                          struct array *rqsts, struct array *fwds,
-                          struct array *work, buffer *bfr) {
-  iluc_fwrd_rqsts(fwds, rqsts, type, K, A, cr, bfr);
-
-  work->n = 0;
-  if (fwds->n > 0) {
-    sarray_sort(struct request_t, fwds->ptr, fwds->n, r, 1, bfr);
-    struct request_t *pf = (struct request_t *)fwds->ptr;
-
-    uint i, j, k, l, n;
-    scalar v;
-    struct eij_t m = {.r = 0, .c = 0, .p = 0, .v = 0};
-
-#define FILL_RQST(f, g, nd)                                                    \
-  do {                                                                         \
-    struct mij *pa = (struct mij *)A->ptr;                                     \
-    struct mij *pb = (struct mij *)B->ptr;                                     \
-    for (i = 0, j = 0; i < fwds->n; i++) {                                     \
-      l = 0;                                                                   \
-      m.f = pf[i].r, m.p = pf[i].o;                                            \
-      for (; j < A->n && pa[j].f < m.f; j++)                                   \
-        ;                                                                      \
-      assert(j < A->n && pa[j].f == m.f);                                      \
-      for (k = j; k < A->n && pa[k].f == m.f && pa[k].g < m.f; k++) {          \
-        v = pa[k].v;                                                           \
-        for (; l < B->n && pb[l].f < pa[k].g; l++)                             \
-          ;                                                                    \
-        assert(l < B->n && pb[l].f == pa[k].g);                                \
-        for (n = l; n < B->n && pb[n].f == pa[k].g && (pb[n].g < m.f + nd);    \
-             n++)                                                              \
-          ;                                                                    \
-        for (; n < B->n && pb[n].f == pa[k].g; n++) {                          \
-          m.g = pb[n].g, m.v = -v * pb[n].v;                                   \
-          array_cat(struct eij_t, work, &m, 1);                                \
-        }                                                                      \
-      }                                                                        \
-    }                                                                          \
-  } while (0)
-
-    if (type == CSC)
-      FILL_RQST(r, c, 0);
-    else
-      FILL_RQST(c, r, 1);
-
-#undef FILL_RQST
-  }
-
-  iluc_send_data(data, type, A, work, cr, bfr);
-}
-
-static void iluc_update(struct array *tij, ulong K, struct array *data, int row,
-                        buffer *bfr) {
-  // FIXME: This can be done more efficiently
-  struct mij m = {.r = 0, .c = 0, .idx = 0, .p = 0, .v = 0};
-  uint j;
-  if (K) {
-    if (row) {
-      sarray_sort(struct eij_t, data->ptr, data->n, c, 1, bfr);
-      struct eij_t *pd = (struct eij_t *)data->ptr;
-      m.r = K;
-      for (j = 0; j < data->n; j++) {
-        m.c = pd[j].c, m.v = pd[j].v;
-        array_cat(struct mij, tij, &m, 1);
-      }
-    } else {
-      sarray_sort(struct eij_t, data->ptr, data->n, r, 1, bfr);
-      struct eij_t *pd = (struct eij_t *)data->ptr;
-      m.c = K;
-      for (; j < data->n; j++) {
-        m.r = pd[j].r, m.v = pd[j].v;
-        array_cat(struct mij, tij, &m, 1);
-      }
-    }
-  }
-
-  struct array tmp;
-  array_init(struct mij, &tmp, tij->n + 1);
-
-  if (tij->n > 0) {
-    uint i = 1, j = 0;
-    struct mij *pt = NULL;
-    if (row) {
-      sarray_sort(struct mij, tij->ptr, tij->n, c, 1, bfr);
-      pt = (struct mij *)tij->ptr;
-      for (; i < tij->n; i++) {
-        if (pt[i].c != pt[j].c) {
-          array_cat(struct mij, &tmp, &pt[j], 1);
-          j = i;
-        } else
-          pt[j].v += pt[i].v;
-      }
-    } else {
-      sarray_sort(struct mij, tij->ptr, tij->n, r, 1, bfr);
-      pt = (struct mij *)tij->ptr;
-      for (; i < tij->n; i++) {
-        if (pt[i].r != pt[j].r) {
-          array_cat(struct mij, &tmp, &pt[j], 1);
-          j = i;
-        } else
-          pt[j].v += pt[i].v;
-      }
-    }
-    if (j < i && pt)
-      array_cat(struct mij, &tmp, &pt[j], 1);
-
-    tij->n = 0;
-    array_cat(struct mij, tij, tmp.ptr, tmp.n);
-  }
-
-  array_free(&tmp);
-}
-
-static void iluc_level(struct array *lij, struct array *uij, int lvl,
-                       struct ilu *ilu, struct array *data, struct array *work,
-                       buffer *bfr) {
-  // Work arrays
-  struct array rij, cij;
-  array_init(struct mij, &rij, 30);
-  array_init(struct mij, &cij, 30);
-
-  struct array rqst, fwds;
-  array_init(struct request_t, &rqst, 30);
-  array_init(struct request_t, &fwds, 30);
-
-  struct par_mat *L = &ilu->L, *U = &ilu->U;
-  struct crystal *cr = &ilu->cr;
-
-  // Figure out start and end of the level and agree on a range
-  uint *lvl_off = ilu->lvl_off, s = lvl_off[lvl - 1];
-  sint buf[2], size = lvl_off[lvl] - s;
-  comm_allreduce(&cr->comm, gs_int, gs_max, &size, 1, buf);
-  uint e = s + size;
-
-  uint i, j, je, k;
-  for (k = s; k < e; k++) {
-    ulong K = (k < lvl_off[lvl]) ? U->rows[k] : 0;
-
-    // Fetch required data (combine with the other call below)
-    iluc_get_data(data, K, CSC, lij, uij, cr, &rqst, &fwds, work, bfr);
-
-    // Init z[1:K] = 0, z[K:n] = a_{K, K:n}, i.e., z = u_{K,:}
-    rij.n = 0;
-    if (K) {
-      struct mij m = {.r = K, .c = 0, .idx = 0, .p = 0, .v = 0};
-      for (j = U->adj_off[k], je = U->adj_off[k + 1]; j < je; j++) {
-        m.c = U->cols[U->adj_idx[j]], m.v = U->adj_val[j];
-        array_cat(struct mij, &rij, &m, 1);
-      }
-    }
-    // Update z if l_KI != 0 for all I, 1 <= I < K
-    iluc_update(&rij, K, data, 1, bfr);
-
-    // Fetch required data (combine with the other call above)
-    iluc_get_data(data, K, CSR, uij, lij, cr, &rqst, &fwds, work, bfr);
-
-    // Init w[1:K] = 0, w[K] = 1, w[K+1:n] = a_{K+1:n, K}, i.e., w = l_{:, K}
-    cij.n = 0;
-    if (K) {
-      struct mij m = {.r = 0, .c = K, .idx = 0, .p = 0, .v = 0};
-      for (j = L->adj_off[k] + 1, je = L->adj_off[k + 1]; j < je; j++) {
-        m.r = L->rows[L->adj_idx[j]], m.v = L->adj_val[j];
-        array_cat(struct mij, &cij, &m, 1);
-      }
-    }
-    // Update w if u_IK != 0 for all I, 1 <= I < K
-    iluc_update(&cij, K, data, 0, bfr);
-
-    // Set u_{k, :} = z and find u_kk
-    scalar u_kk = 1;
-    struct mij *pt = (struct mij *)rij.ptr;
-    if (K) {
-      if (rij.n > 0 && fabs(pt[0].v) > 1e-12)
-        u_kk = pt[0].v;
-      array_cat(struct mij, uij, rij.ptr, rij.n);
-    }
-
-    // Set l_{:, K} = w/u_KK and l_KK = 1
-    pt = (struct mij *)cij.ptr;
-    for (j = 0; j < cij.n; j++)
-      pt[j].v /= u_kk;
-
-    if (K) {
-      struct mij m = {.r = K, .c = K, .idx = 0, .p = 0, .v = 1};
-      array_cat(struct mij, &cij, &m, 1);
-      array_cat(struct mij, lij, cij.ptr, cij.n);
-    }
-  }
-
-  array_free(&rij), array_free(&cij);
-  array_free(&rqst), array_free(&fwds);
-}
-
-//=============================================================================
-// ILUCP
-//
-struct pivot_t {
-  ulong k;
-  uint p, pivot;
-};
-
-static void ilucp_get_data(struct array *data, ulong P, int type,
-                           struct array *A, ulong K, struct array *B,
-                           struct array *pvts, struct crystal *cr,
-                           struct array *rqsts, struct array *fwds,
-                           struct array *work, buffer *bfr) {
-  iluc_fwrd_rqsts(fwds, rqsts, type, P, A, cr, bfr);
-
-  work->n = 0;
-  if (fwds->n > 0) {
-    sarray_sort(struct request_t, fwds->ptr, fwds->n, r, 1, bfr);
-    struct request_t *pf = (struct request_t *)fwds->ptr;
-
-    uint i, j, k, l, n, o;
-    scalar v;
-    struct eij_t m = {.r = 0, .c = 0, .p = 0, .v = 0};
-
-#define FILL_RQST(f, g, nd)                                                    \
-  do {                                                                         \
-    struct mij *pa = (struct mij *)A->ptr;                                     \
-    struct mij *pb = (struct mij *)B->ptr;                                     \
-    for (i = 0, j = 0; i < fwds->n; i++) {                                     \
-      l = 0;                                                                   \
-      m.f = pf[i].r, m.p = pf[i].o;                                            \
-      for (; j < A->n && pa[j].f < m.f; j++)                                   \
-        ;                                                                      \
-      assert(j < A->n && pa[j].f == m.f);                                      \
-      for (k = j; k < A->n && pa[k].f == m.f && pa[k].g < K; k++) {            \
-        v = pa[k].v;                                                           \
-        for (; l < B->n && pb[l].f < pa[k].g; l++)                             \
-          ;                                                                    \
-        assert(l < B->n && pb[l].f == pa[k].g);                                \
-        for (n = l; n < B->n && pb[n].f == pa[k].g && (pb[n].g < K + nd); n++) \
-          ;                                                                    \
-        if (pvts != NULL) {                                                    \
-          struct pivot_t *pp = (struct pivot_t *)pvts->ptr;                    \
-          o = 0;                                                               \
-          for (; n < B->n && pb[n].f == pa[k].g; n++) {                        \
-            m.g = pb[n].g, m.v = -v * pb[n].v;                                 \
-            while (o < pvts->n && pp[o].k < m.g)                               \
-              o++;                                                             \
-            assert(o < pvts->n && pp[o].k == m.g);                             \
-            if (!pp[o].pivot)                                                  \
-              array_cat(struct eij_t, work, &m, 1);                            \
-          }                                                                    \
-        } else {                                                               \
-          for (; n < B->n && pb[n].f == pa[k].g; n++) {                        \
-            m.g = pb[n].g, m.v = -v * pb[n].v;                                 \
-            array_cat(struct eij_t, work, &m, 1);                              \
-          }                                                                    \
-        }                                                                      \
-      }                                                                        \
-    }                                                                          \
-  } while (0)
-
-    if (type == CSC)
-      FILL_RQST(r, c, 0);
-    else
-      FILL_RQST(c, r, 1);
-
-#undef FILL_RQST
-  }
-
-  iluc_send_data(data, type, A, work, cr, bfr);
-}
-
-static ulong ilucp_find_pvt(ulong *perm, uint k, int lvl, uint *lvl_off,
-                            struct array *row, struct crystal *cr,
-                            buffer *bfr) {
-  // First sort by the absolute value and then setup a gs handle to iteratively
-  // select a pivot
-  ulong p = 0;
-  if (k < lvl_off[lvl]) {
-    scalar v = 0;
-    struct mij *pr = (struct mij *)row->ptr;
-    for (uint i = 0; i < row->n && pr[i].c < lvl_off[lvl]; i++) {
-      if (fabs(pr[i].v) > v) {
-        v = fabs(pr[i].v);
-        p = pr[i].c;
-      }
-    }
-    perm[k] = p;
-  }
-  return p;
-}
-
-static void ilucp_update_pvts(struct array *pvts, struct array *rij,
-                              ulong *perm, uint k, int lvl, uint *lvl_off,
-                              struct crystal *cr, buffer *bfr) {
-  struct comm *c = &cr->comm;
-
-  struct pivot_t t = {.k = 0, .pivot = 0};
-  struct mij *pr = (struct mij *)rij->ptr;
-  for (uint i = 0; i < rij->n; i++) {
-    t.k = pr[i].c, t.p = t.k % c->np;
-    array_cat(struct pivot_t, pvts, &t, 1);
-  }
-
-  uint e = (k < lvl_off[lvl] ? k : lvl_off[lvl]);
-  t.pivot = 1;
-  for (uint i = 0; i < e; i++) {
-    t.k = perm[i], t.p = t.k % c->np;
-    array_cat(struct pivot_t, pvts, &t, 1);
-  }
-
-  if (pvts->n > 0) {
-    struct array temp;
-    array_init(struct pivot_t, &temp, pvts->n + 1);
-
-    sarray_sort_2(struct pivot_t, pvts->ptr, pvts->n, k, 1, pivot, 1, bfr);
-    struct pivot_t *pp = (struct pivot_t *)pvts->ptr;
-    uint i = 1, j = 0;
-    for (; i < pvts->n; i++) {
-      if (pp[i].k != pp[j].k) {
-        array_cat(struct pivot_t, &temp, &pp[i - 1], 1);
-        j = i;
-      }
-    }
-    if (j < i)
-      array_cat(struct pivot_t, &temp, &pp[i - 1], 1);
-    pvts->n = 0;
-    array_cat(struct pivot_t, pvts, temp.ptr, temp.n);
-    array_free(&temp);
-  }
-
-  sarray_transfer(struct pivot_t, pvts, p, 1, cr);
-  sarray_sort_2(struct pivot_t, pvts->ptr, pvts->n, k, 1, pivot, 0, bfr);
-
-  if (pvts->n > 0) {
-    struct pivot_t *pp = (struct pivot_t *)pvts->ptr;
-    uint i = 1, j = 0;
-    for (; i < pvts->n; i++) {
-      if (pp[i].k != pp[j].k) {
-        for (; j < i - 1; j++)
-          pp[j].pivot = pp[i - 1].pivot;
-        j = i;
-      }
-    }
-    if (j < i) {
-      for (; j < i - 1; j++)
-        pp[j].pivot = pp[i - 1].pivot;
-    }
-  }
-
-  sarray_transfer(struct pivot_t, pvts, p, 1, cr);
-  sarray_sort(struct pivot_t, pvts->ptr, pvts->n, k, 1, bfr);
-}
-
-static void ilucp_level(struct array *lij, struct array *uij, int lvl,
-                        struct ilu *ilu, struct array *pvts, struct array *data,
-                        struct array *work, buffer *bfr) {
-  // Work arrays
-  struct array rij, cij;
-  array_init(struct mij, &rij, 30);
-  array_init(struct mij, &cij, 30);
-
-  struct array rqst, fwds;
-  array_init(struct request_t, &rqst, 30);
-  array_init(struct request_t, &fwds, 30);
-
-  struct par_mat *L = &ilu->L, *U = &ilu->U;
-  struct crystal *cr = &ilu->cr;
-
-  // Figure out start and end of the level and agree on a range
-  uint *lvl_off = ilu->lvl_off, s = lvl_off[lvl - 1];
-  sint buf[2], size = lvl_off[lvl] - s;
-  comm_allreduce(&cr->comm, gs_int, gs_max, &size, 1, buf);
-  uint e = s + size;
-
-  uint i, j, je, k, l;
-  for (k = s; k < e; k++) {
-    ulong K = (k < lvl_off[lvl]) ? U->rows[k] : 0;
-
-    // Fetch required data. We will skip the data in the  columns which were
-    // choosen as pivots.
-    ilucp_get_data(data, K, CSC, lij, K, uij, pvts, cr, &rqst, &fwds, work,
-                   bfr);
-
-    // Init z[1:K] = 0, z[K:n] = a_{K, K:n}, i.e., z = u_{K,:} and skip the
-    // columns which have been choosen as pivots.
-    rij.n = 0;
-    if (K) {
-      struct mij m = {.r = K, .c = 0, .idx = 0, .p = 0, .v = 0};
-      struct pivot_t *pp = (struct pivot_t *)pvts->ptr;
-      for (j = U->adj_off[k], je = U->adj_off[k + 1], l = 0; j < je; j++) {
-        m.c = U->cols[U->adj_idx[j]], m.v = U->adj_val[j];
-        while (l < pvts->n && pp[l].k < m.c)
-          l++;
-        assert(pp[l].k == m.c);
-        if (!pp[l].pivot)
-          array_cat(struct mij, &rij, &m, 1);
-      }
-    }
-
-    // Update z if l_KI != 0 for all I, 1 <= I < K
-    iluc_update(&rij, K, data, 1, bfr);
-
-    // Select the pivot now -- all the active processors have to agree on their
-    // own pivot. If two processors share the same pivot, smallest one wins and
-    // others have to concede and find another one. So we will send a pivot
-    // candidate list and make each processor pick one. Right now the candidate
-    // list = updated row.
-    ulong P = ilucp_find_pvt(ilu->perm, k, lvl, lvl_off, &rij, cr, bfr);
-
-    // Sync the pivots: Basically everyone gets updated about which cols of U
-    // have become pivots. Can't be done through a gs call, will have to send
-    // all the cols in U and the current row a_k along with the info if its
-    // a pivot.
-    ilucp_update_pvts(pvts, &rij, ilu->perm, k, lvl, lvl_off, cr, bfr);
-
-    // Fetch required data for col updated. Can't combine with above call when
-    // we pivot? Will need to reimplement this part
-    ilucp_get_data(data, P, CSR, uij, K, lij, NULL, cr, &rqst, &fwds, work,
-                   bfr);
-
-    // Init w[1:K] = 0, w[K] = 1, w[K+1:n] = a_{K+1:n, K}, i.e., w = l_{:, K}
-    cij.n = 0;
-    if (K) {
-      struct mij m = {.r = 0, .c = K, .idx = 0, .p = 0, .v = 0};
-      for (j = L->adj_off[k] + 1, je = L->adj_off[k + 1]; j < je; j++) {
-        m.r = L->rows[L->adj_idx[j]], m.v = L->adj_val[j];
-        array_cat(struct mij, &cij, &m, 1);
-      }
-    }
-    // Update w if u_IK != 0 for all I, 1 <= I < K
-    iluc_update(&cij, K, data, 0, bfr);
-
-    // Set u_{k, :} = z and find u_kk
-    // FIXME: This should u_{perm[k],perm[k]}}, not u_kk
-    scalar u_kk = 1;
-    struct mij *pt = (struct mij *)rij.ptr;
-    if (K) {
-      if (rij.n > 0 && fabs(pt[0].v) > 1e-12)
-        u_kk = pt[0].v;
-      array_cat(struct mij, uij, rij.ptr, rij.n);
-    }
-
-    // Set l_{:, K} = w/u_KK and l_KK = 1
-    pt = (struct mij *)cij.ptr;
-    for (j = 0; j < cij.n; j++)
-      pt[j].v /= u_kk;
-
-    if (K) {
-      struct mij m = {.r = K, .c = K, .idx = 0, .p = 0, .v = 1};
-      array_cat(struct mij, &cij, &m, 1);
-      array_cat(struct mij, lij, cij.ptr, cij.n);
-    }
-  }
-
-  array_free(&rij), array_free(&cij), array_free(&rqst), array_free(&fwds);
-}
-
-static void iluc(struct ilu *ilu, buffer *bfr) {
-  struct crystal *cr = &ilu->cr;
-  struct comm *c = &cr->comm;
-
-  // Setup L and U
-  iluc_sep_lu(ilu, bfr);
-
-  struct par_mat *A = &ilu->A, *L = &ilu->L, *U = &ilu->U;
-
-  struct array uij, lij, data, work;
-  array_init(struct mij, &uij, A->rn * 30 + 1);
-  array_init(struct mij, &lij, A->rn * 30 + 1);
-  array_init(struct eij_t, &data, A->rn * 30 + 1);
-  array_init(struct eij_t, &work, A->rn * 30 + 1);
-
-  struct array pvts;
-  array_init(struct pivot_t, &pvts, L->cn + 1);
-
-  if (ilu->pivot) {
-    ilu->perm = tcalloc(ulong, A->rn);
-    // Initialize with the columns of U, i.e, columns of L
-    struct pivot_t t = {.k = 0, .p = 0, .pivot = 0};
-    for (uint i = 0; i < U->cn; i++) {
-      t.k = U->cols[i], t.p = t.k % c->np;
-      array_cat(struct pivot_t, &pvts, &t, 1);
-    }
-
-    for (int l = 1; l <= ilu->nlvls; l++)
-      ilucp_level(&lij, &uij, l, ilu, &pvts, &data, &work, bfr);
-  } else {
-    for (int l = 1; l <= ilu->nlvls; l++)
-      iluc_level(&lij, &uij, l, ilu, &data, &work, bfr);
-  }
-
-  par_mat_free(L), par_mat_free(U);
-  par_mat_setup(U, &uij, CSR, 0, bfr);
-  par_mat_setup(L, &lij, CSC, 0, bfr);
-
-  const char *val = getenv("PARRSB_DUMP_ILU");
-  if (val != NULL && atoi(val) != 0) {
-    par_mat_dump("LL.txt", L, cr, bfr);
-    par_mat_dump("UU.txt", U, cr, bfr);
-  }
-
-  array_free(&pvts);
-  array_free(&lij), array_free(&uij);
-  array_free(&work), array_free(&data);
-}
-
-//=============================================================================
-// ILU API related functions
-//
-// `vtx` array is in the order of sorted element ids
-static int ilu_setup_aux(struct ilu *ilu, int nlvls, uint *lvl_off,
-                         uint *lvl_owner, ulong *lvl_ids, const uint n,
-                         const int nv, const slong *vtx, const int verbose,
-                         buffer *bfr) {
-  struct elm {
-    slong vtx[8];
-    uint p, lvl;
-    ulong e;
-  };
-
-  struct crystal *cr = &ilu->cr;
-  struct comm *c = &cr->comm;
-
-  // Send the elements in each level to the owner
-  struct array elms;
-  array_init(struct elm, &elms, n);
-
-  struct elm elm;
-  for (int l = 0; l < nlvls; l++) {
-    for (uint i = lvl_off[l]; i < lvl_off[l + 1]; i++) {
-      elm.lvl = l + 1, elm.e = lvl_ids[i], elm.p = lvl_owner[i];
-      array_cat(struct elm, &elms, &elm, 1);
-    }
-  }
-  sarray_sort(struct elm, elms.ptr, elms.n, e, 1, bfr);
-
-  struct elm *pe = (struct elm *)elms.ptr;
-  if (elms.n > 0) {
-    // Sanity check
-    assert(elms.n == n);
-    for (uint i = 0; i < n; i++) {
-      for (int v = 0; v < nv; v++)
-        pe[i].vtx[v] = vtx[i * nv + v];
-    }
-  }
-
-  sarray_transfer(struct elm, &elms, p, 1, cr);
-  sarray_sort_2(struct elm, elms.ptr, elms.n, lvl, 0, e, 1, bfr);
-
-  // Setup the ILU structure: allocate ILU data structures.
-  ilu->nlvls = nlvls;
-  ilu->lvl_off = (uint *)tcalloc(uint, ilu->nlvls + 1);
-
-  uint s = 0, e = 0;
-  ilu->lvl_off[0] = s;
-  pe = (struct elm *)elms.ptr;
-  for (int l = 1; l <= ilu->nlvls; l++) {
-    while (e < elms.n && pe[e].lvl == l)
-      e++;
-    ilu->lvl_off[l] = ilu->lvl_off[l - 1] + e - s;
-    s = e;
-  }
-
-  // Number rows now: All the elements in Level 0 are numbered before Level
-  // 1 and so on.
-  ulong *ids = trealloc(ulong, ids, elms.n);
-  ulong ng = 0;
-  for (int l = 0; l < ilu->nlvls; l++) {
-    e = ilu->lvl_off[l + 1], s = ilu->lvl_off[l];
-    slong out[2][1], buf[2][1], in = e - s;
-    comm_scan(out, c, gs_long, gs_add, &in, 1, buf);
-    ulong start = ng + out[0][0] + 1;
-    for (; s < e; s++)
-      ids[s] = start++;
-    ng += out[1][0];
-  }
-
-  slong *vrt = tcalloc(slong, elms.n * nv);
-  for (uint i = 0; i < elms.n; i++) {
-    for (int j = 0; j < nv; j++)
-      vrt[i * nv + j] = pe[i].vtx[j];
-  }
-
-  if (verbose > 1) {
-    for (uint i = 0; i < elms.n; i++) {
-      printf("fid = %llu, ", ids[i]);
-      for (int v = 0; v < nv; v++)
-        printf("%lld, ", vrt[i * nv + v]);
-      printf("\n");
-      fflush(stdout);
-    }
-  }
-
-  // Find and compress neighbors in order to form the Laplacian
-  struct array nbrs, eij;
-  find_nbrs(&nbrs, ids, vrt, elms.n, nv, cr, bfr);
-  compress_nbrs(&eij, &nbrs, bfr);
-  free(ids), free(vrt);
-  array_free(&elms), array_free(&nbrs);
-
-  // Setup the parallel CSR matrix
-  par_csr_setup(&ilu->A, &eij, 0, bfr);
-  array_free(&eij);
-
-  return 0;
-}
-
-struct ilu *ilu_setup(const uint n, const int nv, const long long *llvtx,
-                      const ilu_options *options, MPI_Comm comm) {
-  struct comm c;
-  comm_init(&c, comm);
-
-  struct ilu *ilu = tcalloc(struct ilu, 1);
-  ilu->pivot = options->pivot, ilu->verbose = options->verbose;
-  ilu->tol = options->tol, ilu->nnz_per_row = options->nnz_per_row;
-  ilu->lvl_off = NULL, ilu->perm = NULL;
-  crystal_init(&ilu->cr, &c);
-
-  slong *vtx = tcalloc(slong, n * nv);
-  for (uint i = 0; i < n * nv; i++)
-    vtx[i] = llvtx[i];
-
-  // Establish a numbering based on input
-  slong out[2][1], buf[2][1], in = n;
-  comm_scan(out, &c, gs_long, gs_add, &in, 1, buf);
-  ulong s = out[0][0], ng = out[1][0];
-
-  ulong *ids = tcalloc(ulong, n);
-  for (uint i = 0; i < n; i++)
-    ids[i] = s + i + 1;
-
-  buffer bfr;
-  buffer_init(&bfr, 1024);
-
-  uint *lvl_off = tcalloc(uint, 100 + n), *lvl_owner = lvl_off + 100;
-  ulong *lvl_ids = tcalloc(ulong, n);
-  int nlvls = find_lvls(lvl_off, lvl_owner, lvl_ids, n, nv, ids, vtx, 1,
-                        &ilu->cr, ilu->verbose, &bfr);
-  ilu_setup_aux(ilu, nlvls, lvl_off, lvl_owner, lvl_ids, n, nv, vtx,
-                ilu->verbose, &bfr);
-
-  char *val = getenv("PARRSB_DUMP_ILU");
-  if (val != NULL && atoi(val) != 0)
-    par_mat_dump("A.txt", &ilu->A, &ilu->cr, &bfr);
-
-  // Setup the ILU factors
-  switch (options->type) {
-  case 0:
-    ilu0(ilu, &bfr);
-    break;
-  case 1:
-    iluc(ilu, &bfr);
-    break;
-  default:
-    break;
-  }
-
-  val = getenv("PARRSB_DUMP_ILU");
-  if (val != NULL && atoi(val) != 0)
-    par_mat_dump("B.txt", &ilu->A, &ilu->cr, &bfr);
-
-  free(ids), free(vtx), free(lvl_off), free(lvl_ids);
-  buffer_free(&bfr), comm_free(&c);
-
-  return ilu;
-}
-
-void ilu_free(struct ilu *ilu) {
-  if (ilu) {
-    crystal_free(&ilu->cr);
-    if (ilu->nlvls > 0) {
-      par_mat_free(&ilu->A);
-      // FIXME: Cleanup L and U
-      // par_mat_free(&ilu->L);
-      // par_mat_free(&ilu->U);
-    }
-    if (ilu->lvl_off)
-      free(ilu->lvl_off), ilu->lvl_off = NULL;
-    if (ilu->perm)
-      free(ilu->perm), ilu->perm = NULL;
-    free(ilu);
-  }
-}
-
-#undef CSC
-#undef CSR
diff --git a/src/ilu.h b/src/ilu.h
deleted file mode 100644
index 1460240e..00000000
--- a/src/ilu.h
+++ /dev/null
@@ -1,24 +0,0 @@
-#ifndef _PARRSB_ILU_H_
-#define _PARRSB_ILU_H_
-
-#include "mat.h"
-
-typedef struct {
-  // ILU type: ILU(0), ILUC, etc.
-  int type;
-  // Verbose level: 0, 1, etc.
-  int verbose;
-  // Use pivoting or not: 0 or 1
-  int pivot;
-  // 1st dropping rule: An entry a_ij is dropped abs(a_ij) < tol
-  scalar tol;
-  // 2nd dropping rule: Entries are dropped so that total nnz per row/col < p
-  unsigned int nnz_per_row;
-} ilu_options;
-
-struct ilu;
-struct ilu *ilu_setup(const uint n, const int nv, const long long *vtx,
-                      const ilu_options *options, MPI_Comm comm);
-void ilu_free(struct ilu *ilu);
-
-#endif
diff --git a/src/io.c b/src/io.c
index 6aebe026..47dec216 100644
--- a/src/io.c
+++ b/src/io.c
@@ -1,5 +1,7 @@
 #include "parrsb-impl.h"
 
+#include <math.h>
+
 #define READ_T(coords, buf, T, nv)                                             \
   { memcpy((coords), buf, sizeof(T) * nv); }
 
@@ -62,17 +64,12 @@ static void re2_header(unsigned *nelt_, unsigned *nv_, ulong *nelgt_,
 
 static void re2_coord(double **coord_, unsigned int nelt, int nv, MPI_File file,
                       struct comm *c) {
-  uint rank = c->id, size = c->np;
-
-  slong out[2][1], bfr[2][1], in = nelt;
-  comm_scan(out, c, gs_long, gs_add, &in, 1, bfr);
-  slong start = out[0][0];
-
-  int ndim = (nv == 4) ? 2 : 3;
+  unsigned ndim = (nv == 4) ? 2 : 3;
   size_t elem_size = nv * ndim * sizeof(double) + sizeof(double);
   size_t header_size = GC_RE2_HEADER_LEN + sizeof(float);
 
   // Calculate read size for element data on each MPI rank.
+  uint rank = c->id;
   size_t read_size = nelt * elem_size + (rank == 0) * header_size;
   char *buf = (char *)calloc(read_size, sizeof(char));
   MPI_Status st;
@@ -220,8 +217,6 @@ static void re2_boundary(unsigned int *nbcs_, long long **bcs_,
 static void read_geometry(unsigned *nelt, unsigned *nv, double **coord,
                           unsigned *nbcs, long long **bcs, char *fname,
                           struct comm *c) {
-  uint rank = c->id, size = c->np;
-
   MPI_Info info;
   check_mpi_call(MPI_Info_create(&info), "MPI_Info_create", c);
 
@@ -258,12 +253,12 @@ static int read_connectivity(unsigned int *nelt_, unsigned *nv_,
   err = MPI_File_read_all(file, buf, GC_CO2_HEADER_LEN, MPI_BYTE, &st);
 
   long long nelgt, nelgv;
-  int nv;
+  unsigned nv;
   char version[6];
-  sscanf(buf, "%5s %12lld %12lld %d", version, &nelgt, &nelgv, &nv);
+  sscanf(buf, "%5s %12lld %12lld %u", version, &nelgt, &nelgv, &nv);
 
   // TODO: Assert version
-  int nelt = nelgt / size, nrem = nelgt - nelt * size;
+  uint nelt = nelgt / size, nrem = nelgt - nelt * size;
   nelt += (rank > (size - 1 - nrem) ? 1 : 0);
 
   if (*nv_ != 0) {
@@ -303,10 +298,6 @@ static int read_connectivity(unsigned int *nelt_, unsigned *nv_,
     MPI_Abort(comm, 911);
   }
 
-  slong out[2][1], bfr[2][1], in = nelt;
-  comm_scan(out, c, gs_long, gs_add, &in, 1, bfr);
-  slong start = out[0][0];
-
   size_t read_size = nelt * (nv + 1) * sizeof(int);
   size_t header_size = GC_CO2_HEADER_LEN + sizeof(float);
   if (rank == 0)
@@ -318,19 +309,18 @@ static int read_connectivity(unsigned int *nelt_, unsigned *nv_,
 
   char *buf0 = buf + (rank == 0) * header_size;
   long long *vl = *vl_ = tcalloc(long long, nv *nelt);
-  int j, tmp1, tmp2;
+  int tmp1, tmp2;
   for (uint i = 0; i < nelt; i++) {
     READ_T(&tmp1, buf0, int, 1);
     buf0 += sizeof(int);
-    for (j = 0; j < nv; j++) {
+    for (unsigned j = 0; j < nv; j++) {
       READ_T(&tmp2, buf0, int, 1);
       buf0 += sizeof(int);
       vl[i * nv + j] = tmp2;
     }
   }
 
-  if (buf)
-    free(buf);
+  free(buf);
 
   return 0;
 }
@@ -347,7 +337,7 @@ int parrsb_read_mesh(unsigned *nel, unsigned *nv, long long **vl,
 
   // Read geometry from .re2 file
   if (read & 1) {
-    char geom_name[BUFSIZ];
+    char geom_name[BUFSIZ + 5] = {0}; // room for ".re2" + NUL; zeroed
     strncpy(geom_name, name, BUFSIZ);
     strncat(geom_name, ".re2", 5);
     read_geometry(nel, nv, coord, nbcs, bcs, geom_name, &c);
@@ -355,7 +345,7 @@ int parrsb_read_mesh(unsigned *nel, unsigned *nv, long long **vl,
 
   // Read connectivity from .co2 file if the user asks us to read it.
   if (read & 2) {
-    char conn_name[BUFSIZ];
+    char conn_name[BUFSIZ + 5] = {0}; // room for ".co2" + NUL; zeroed
     strncpy(conn_name, name, BUFSIZ);
     strncat(conn_name, ".co2", 5);
     read_connectivity(nel, nv, vl, conn_name, &c);
@@ -375,7 +365,7 @@ int parrsb_dump_con(char *name, unsigned nelt, unsigned nv, long long *vl,
   comm_init(&c, comm);
   uint id = c.id;
 
-  char co2_name[BUFSIZ];
+  char co2_name[BUFSIZ + 5] = {0}; // room for ".co2" + NUL; zeroed
   strncpy(co2_name, name, BUFSIZ);
   strncat(co2_name, ".co2", 5);
 
@@ -413,12 +403,11 @@ int parrsb_dump_con(char *name, unsigned nelt, unsigned nv, long long *vl,
     buf0 += sizeof(float);
   }
 
-  int i, j, temp;
-  for (i = 0; i < nelt; i++) {
-    temp = start + i + 1;
+  for (unsigned i = 0; i < nelt; i++) {
+    int temp = start + i + 1;
     WRITE_INT(buf0, temp);
     buf0 += sizeof(int);
-    for (j = 0; j < nv; j++) {
+    for (unsigned j = 0; j < nv; j++) {
       temp = vl[i * nv + j];
       WRITE_INT(buf0, temp);
       buf0 += sizeof(int);
@@ -442,7 +431,7 @@ int parrsb_dump_map(char *name, unsigned nelt, unsigned nv, long long *vtx,
   char version[6] = "#v001";
   float test = 6.54321;
 
-  char ma2_name[BUFSIZ];
+  char ma2_name[BUFSIZ + 5] = {0}; // room for ".ma2" + NUL; zeroed
   strncpy(ma2_name, name, BUFSIZ);
   strncat(ma2_name, ".ma2", 5);
 
@@ -514,64 +503,11 @@ int parrsb_dump_map(char *name, unsigned nelt, unsigned nv, long long *vtx,
   errs += (err != 0);
 
   MPI_Info_free(&infoIn);
-  if (buf)
-    free(buf);
+  free(buf);
 
   return errs;
 }
 
-int parrsb_dump_part(char *name, unsigned nel, unsigned nv, double *coord,
-                     int gid, MPI_Comm comm) {
-  struct comm c;
-  comm_init(&c, comm);
-
-  int rank = c.id, size = c.np;
-
-  MPI_File file;
-  int err = MPI_File_open(comm, name, MPI_MODE_CREATE | MPI_MODE_WRONLY,
-                          MPI_INFO_NULL, &file);
-  parrsb_check_error(err, comm);
-
-  slong out[2][1], buf[2][1], nelt = nel;
-  comm_scan(out, &c, gs_long, gs_add, &nelt, 1, buf);
-  slong start = out[0][0], nelgt = out[1][0];
-
-  int ndim = (nv == 8) ? 3 : 2;
-  uint wsize = (ndim * sizeof(double) + sizeof(int)) * nelt;
-  if (rank == 0)
-    wsize += sizeof(slong) + sizeof(int); // for nelgt and ndim
-
-  char *pbuf, *pbuf0;
-  pbuf = pbuf0 = (char *)tcalloc(char, wsize);
-  if (rank == 0) {
-    WRITE_T(pbuf0, &nelgt, slong, 1);
-    WRITE_T(pbuf0, &ndim, int, 1);
-  }
-
-  uint i, j, k;
-  double tcoord[3];
-  for (i = 0; i < nelt; i++) {
-    tcoord[0] = tcoord[1] = tcoord[2] = 0.0;
-    for (j = 0; j < nv; j++)
-      for (k = 0; k < ndim; k++)
-        tcoord[k] += coord[i * nv * ndim + j * ndim + k];
-    tcoord[0] /= nv, tcoord[1] /= nv, tcoord[2] /= nv;
-    WRITE_T(pbuf0, tcoord, double, ndim);
-    WRITE_T(pbuf0, &gid, int, 1);
-  }
-
-  MPI_Status st;
-  err = MPI_File_write_ordered(file, pbuf, wsize, MPI_BYTE, &st);
-  parrsb_check_error(err, comm);
-
-  err += MPI_File_close(&file);
-  parrsb_check_error(err, comm);
-
-  free(pbuf);
-
-  return err;
-}
-
 #undef check_call
 #undef check_mpi_call
 
diff --git a/src/laplacian.c b/src/laplacian.c
index db0b84ba..34067751 100644
--- a/src/laplacian.c
+++ b/src/laplacian.c
@@ -18,8 +18,9 @@ struct csr_laplacian {
 };
 
 static void find_nbrs_rsb(struct array *arr, const struct rsb_element *elems,
-                          const uint nelt, const int nv, const struct comm *c,
-                          struct crystal *cr, buffer *buf) {
+                          const uint nelt, const unsigned nv,
+                          const struct comm *c, struct crystal *cr,
+                          buffer *buf) {
   slong out[2][1], bfr[2][1], in = nelt;
   comm_scan(out, c, gs_long, gs_add, &in, 1, bfr);
   ulong eid = out[0][0] + 1;
@@ -157,7 +158,8 @@ struct gs_laplacian {
 };
 
 static int gs_weighted_init(struct laplacian *l, struct rsb_element *elems,
-                            uint lelt, int nv, struct comm *c, buffer *buf) {
+                            const uint lelt, const unsigned nv, struct comm *c,
+                            buffer *buf) {
 
   uint npts = nv * lelt;
   slong *vertices = tcalloc(slong, npts);
@@ -188,10 +190,9 @@ static int gs_weighted_init(struct laplacian *l, struct rsb_element *elems,
   return 0;
 }
 
-static int gs_weighted(scalar *v, struct laplacian *l, scalar *u, buffer *buf) {
+static int gs_weighted(scalar *v, struct laplacian *l, scalar *u, buffer *bfr) {
   uint lelt = l->nel;
-  int nv = l->nv;
-
+  unsigned nv = l->nv;
   struct gs_laplacian *gl = l->data;
 
   uint i, j;
@@ -199,7 +200,7 @@ static int gs_weighted(scalar *v, struct laplacian *l, scalar *u, buffer *buf) {
     for (j = 0; j < nv; j++)
       gl->u[nv * i + j] = u[i];
 
-  gs(gl->u, gs_double, gs_add, 0, gl->gsh, buf);
+  gs(gl->u, gs_double, gs_add, 0, gl->gsh, bfr);
 
   for (i = 0; i < lelt; i++) {
     v[i] = gl->diag[i] * u[i];
diff --git a/src/mat.c b/src/mat.c
index 34c27c21..37bd401c 100644
--- a/src/mat.c
+++ b/src/mat.c
@@ -21,39 +21,43 @@ int compress_nbrs(struct array *eij, struct array *nbr, buffer *bfr) {
     return 1;
 
   sarray_sort_2(struct nbr, nbr->ptr, nbr->n, r, 1, c, 1, bfr);
-  struct nbr *ptr = (struct nbr *)nbr->ptr;
 
-  struct mij m;
-  m.idx = 0;
-
-  sint i = 0;
-  while (i < nbr->n) {
-    m.r = ptr[i].r, m.c = ptr[i].c;
-
-    sint j = i + 1;
-    while (j < nbr->n && ptr[j].r == ptr[i].r && ptr[j].c == ptr[i].c)
-      j++;
-
-    m.v = i - j; // = - (j - i)
-    array_cat(struct mij, eij, &m, 1);
-    i = j;
+  // Set off diagonal entries.
+  {
+    const struct nbr *const ptr = (const struct nbr *const)nbr->ptr;
+    struct mij m = {.idx = 0};
+    uint i = 0;
+    while (i < nbr->n) {
+      m.r = ptr[i].r, m.c = ptr[i].c;
+
+      uint j = i + 1;
+      while (j < nbr->n && ptr[j].r == ptr[i].r && ptr[j].c == ptr[i].c)
+        j++;
+
+      m.v = j - i, m.v = -m.v;
+      array_cat(struct mij, eij, &m, 1);
+      i = j;
+    }
   }
 
   // Now make sure the row sum is zero
-  struct mij *pe = (struct mij *)eij->ptr;
-  i = 0;
-  while (i < eij->n) {
-    sint j = i, k = -1, s = 0;
-    while (j < eij->n && pe[j].r == pe[i].r) {
-      if (pe[j].r == pe[j].c)
-        k = j;
-      else
-        s += pe[j].v;
-      j++;
+  {
+    struct mij *const pe = (struct mij *const)eij->ptr;
+    uint i = 0;
+    while (i < eij->n) {
+      uint j = i;
+      sint k = -1, s = 0;
+      while (j < eij->n && pe[j].r == pe[i].r) {
+        if (pe[j].r == pe[j].c)
+          k = j;
+        else
+          s += pe[j].v;
+        j++;
+      }
+      assert(k >= 0);
+      pe[k].v = -s;
+      i = j;
     }
-    assert(k >= 0);
-    pe[k].v = -s;
-    i = j;
   }
 
   return 0;
@@ -201,42 +205,48 @@ int mat_free(struct mat *mat) {
 // Find neighbors in the graph
 //
 void find_nbrs(struct array *arr, const ulong *eid, const slong *vtx,
-               const uint nelt, const int nv, struct crystal *cr, buffer *buf) {
-  struct array vertices;
-  array_init(struct nbr, &vertices, nelt * nv);
-
+               const uint nelt, const unsigned nv, struct crystal *cr,
+               buffer *buf) {
   struct comm *c = &cr->comm;
-  struct nbr v = {.r = 0, .c = 0, .proc = 0};
-  uint i, j;
-  for (i = 0; i < nelt; i++) {
-    v.r = eid[i];
-    assert(v.r > 0);
-    for (j = 0; j < nv; j++) {
-      v.c = vtx[i * nv + j], v.proc = v.c % c->np;
-      array_cat(struct nbr, &vertices, &v, 1);
+
+  struct array vertices;
+  {
+    array_init(struct nbr, &vertices, nelt * nv);
+    struct nbr v = {.r = 0, .c = 0, .proc = 0};
+    uint i, j;
+    for (i = 0; i < nelt; i++) {
+      v.r = eid[i];
+      assert(v.r > 0);
+      for (j = 0; j < nv; j++) {
+        v.c = vtx[i * nv + j], v.proc = v.c % c->np;
+        array_cat(struct nbr, &vertices, &v, 1);
+      }
     }
   }
 
   sarray_transfer(struct nbr, &vertices, proc, 1, cr);
   sarray_sort(struct nbr, vertices.ptr, vertices.n, c, 1, buf);
 
-  // FIXME: Assumes quads or hexes
-  struct nbr *pv = (struct nbr *)vertices.ptr, t = {.r = 0, .c = 0, .proc = 0};
   array_init(struct nbr, arr, vertices.n * 10 + 1);
-  uint s = 0, e;
-  while (s < vertices.n) {
-    e = s + 1;
-    while (e < vertices.n && pv[s].c == pv[e].c)
-      e++;
-    for (i = s; i < e; i++) {
-      t = pv[i];
-      for (j = s; j < e; j++) {
-        t.c = pv[j].r;
-        assert(t.r > 0 && t.c > 0);
-        array_cat(struct nbr, arr, &t, 1);
+  // FIXME: Assumes quads or hexes
+  {
+    const struct nbr *const pv = (const struct nbr *const)vertices.ptr;
+    struct nbr t = {.r = 0, .c = 0, .proc = 0};
+    uint s = 0, e;
+    while (s < vertices.n) {
+      e = s + 1;
+      while (e < vertices.n && pv[s].c == pv[e].c)
+        e++;
+      for (uint i = s; i < e; i++) {
+        t = pv[i];
+        for (uint j = s; j < e; j++) {
+          t.c = pv[j].r;
+          assert(t.r > 0 && t.c > 0);
+          array_cat(struct nbr, arr, &t, 1);
+        }
       }
+      s = e;
     }
-    s = e;
   }
 
   sarray_transfer(struct nbr, arr, proc, 1, cr);
@@ -710,38 +720,42 @@ static int compress_mij(struct array *eij, struct array *entries, buffer *bfr) {
     return 1;
 
   sarray_sort_2(struct mij, entries->ptr, entries->n, r, 1, c, 1, bfr);
-  struct mij *ptr = (struct mij *)entries->ptr;
 
-  struct mij m;
-  m.idx = 0;
+  {
+    struct mij m = {.idx = 0};
 
-  uint i = 0;
-  while (i < entries->n) {
-    m = ptr[i];
-    uint j = i + 1;
-    while (j < entries->n && ptr[j].r == ptr[i].r && ptr[j].c == ptr[i].c)
-      m.v += ptr[j].v, j++;
+    const struct mij *const ptr = (const struct mij *const)entries->ptr;
+    uint i = 0;
+    while (i < entries->n) {
+      m = ptr[i];
+      uint j = i + 1;
+      while (j < entries->n && ptr[j].r == ptr[i].r && ptr[j].c == ptr[i].c)
+        m.v += ptr[j].v, j++;
 
-    array_cat(struct mij, eij, &m, 1);
-    i = j;
+      array_cat(struct mij, eij, &m, 1);
+      i = j;
+    }
   }
 
   // Now make sure the row sum is zero
-  struct mij *pe = (struct mij *)eij->ptr;
-  i = 0;
-  while (i < eij->n) {
-    sint j = i, k = -1;
-    scalar s = 0;
-    while (j < eij->n && pe[j].r == pe[i].r) {
-      if (pe[j].r == pe[j].c)
-        k = j;
-      else
-        s += pe[j].v;
-      j++;
+  {
+    struct mij *const pe = (struct mij *const)eij->ptr;
+    uint i = 0;
+    while (i < eij->n) {
+      uint j = i;
+      sint k = -1;
+      scalar s = 0;
+      while (j < eij->n && pe[j].r == pe[i].r) {
+        if (pe[j].r == pe[j].c)
+          k = j;
+        else
+          s += pe[j].v;
+        j++;
+      }
+      assert(k >= 0);
+      pe[k].v = -s;
+      i = j;
     }
-    assert(k >= 0);
-    pe[k].v = -s;
-    i = j;
   }
 
   return 0;
diff --git a/src/mat.h b/src/mat.h
index e1da9bd5..2d9d4ccd 100644
--- a/src/mat.h
+++ b/src/mat.h
@@ -43,7 +43,8 @@ int IS_DIAG(const struct par_mat *A);
 
 // Output array `arr` is an array of type `struct nbr`
 void find_nbrs(struct array *arr, const ulong *eid, const slong *vtx,
-               const uint nelt, const int nv, struct crystal *cr, buffer *buf);
+               const uint nelt, const unsigned nv, struct crystal *cr,
+               buffer *buf);
 // Output array `eij` is an array of type `struct mij`, input array `nbr` is
 // an array of type `struct nbr`
 int compress_nbrs(struct array *eij, struct array *nbr, buffer *bfr);
diff --git a/src/metrics.c b/src/metrics.c
index 26c85fcc..ac3f2df6 100644
--- a/src/metrics.c
+++ b/src/metrics.c
@@ -33,9 +33,9 @@ void metric_toc(struct comm *c, metric m) {
 }
 
 double metric_get_value(int level, metric m) {
-  if (level == -1)
+  if (level < 0)
     return metrics[m];
-  if (level >= 0 && level < stack_size)
+  if ((uint)level < stack_size)
     return stack[level * MAXMETS + m];
   return 0.0;
 }
@@ -93,22 +93,19 @@ void metric_rsb_print(struct comm *c, int profile_level) {
              SUMMARY(i, RSB_LANCZOS));
       printf("      RSB_LANCZOS_TQLI       : %e/%e/%e\n",
              SUMMARY(i, RSB_LANCZOS_TQLI));
-      printf("      RSB_INVERSE_SETUP      : %e/%e/%e\n",
-             SUMMARY(i, RSB_INVERSE_SETUP));
-      printf("      RSB_INVERSE            : %e/%e/%e\n",
-             SUMMARY(i, RSB_INVERSE));
-      printf("      RSB_PROJECT_AX         : %e/%e/%e\n",
-             SUMMARY(i, RSB_PROJECT_AX));
-      printf("      RSB_PROJECT_MG         : %e/%e/%e\n",
-             SUMMARY(i, RSB_PROJECT_MG));
       printf("    RSB_FIEDLER_CALC_NITER   : %e/%e/%e\n",
              SUMMARY(i, RSB_FIEDLER_CALC_NITER));
       printf("  RSB_SORT                   : %e/%e/%e\n", SUMMARY(i, RSB_SORT));
-      printf("  RSB_REPAIR                 : %e/%e/%e\n",
-             SUMMARY(i, RSB_REPAIR));
+      printf("  RSB_COMPONENTS             : %e/%e/%e\n",
+             SUMMARY(i, RSB_COMPONENTS));
+      printf("    RSB_COMPONENTS_NCOMP     : %e/%e/%e\n",
+             SUMMARY(i, RSB_COMPONENTS_NCOMP));
+      printf("  RSB_NEIGHBORS              : %e/%e/%e\n",
+             SUMMARY(i, RSB_NEIGHBORS));
       printf("  RSB_BALANCE                : %e/%e/%e\n",
              SUMMARY(i, RSB_BALANCE));
     }
+    fflush(stdout);
   }
 
   if (wrk)
@@ -148,6 +145,7 @@ void metric_crs_print(struct comm *c, int profile_level) {
       printf("  SCHUR_SOLVE_CHOL2                  : %e/%e/%e\n",
              SUMMARY(i, SCHUR_SOLVE_CHOL2));
     }
+    fflush(stdout);
   }
 
   if (wrk)
diff --git a/src/metrics.h b/src/metrics.h
index 504aaa55..83d0bc3e 100644
--- a/src/metrics.h
+++ b/src/metrics.h
@@ -7,22 +7,24 @@
 // Metrics
 //
 typedef enum {
-  RSB_COMPONENTS = 0,
+  RSB_BALANCE = 0,
+  RSB_COMPONENTS,
+  RSB_COMPONENTS_NCOMP,
   RSB_FIEDLER,
   RSB_FIEDLER_SETUP,
   RSB_FIEDLER_CALC,
   RSB_FIEDLER_CALC_NITER,
+  RSB_INVERSE_SETUP,
+  RSB_INVERSE,
   RSB_LANCZOS_SETUP,
   RSB_LANCZOS,
   RSB_LANCZOS_TQLI,
-  RSB_INVERSE_SETUP,
+  RSB_NEIGHBORS,
+  RSB_PRE,
   RSB_PROJECT_AX,
   RSB_PROJECT_MG,
-  RSB_INVERSE,
-  RSB_SORT,
-  RSB_PRE,
   RSB_REPAIR,
-  RSB_BALANCE,
+  RSB_SORT,
   SCHUR_PROJECT_NITER,
   SCHUR_PROJECT_OPERATOR,
   SCHUR_PROJECT_OPERATOR_FXI,
diff --git a/src/multigrid.c b/src/multigrid.c
index 3cd59e1b..b6e859a0 100644
--- a/src/multigrid.c
+++ b/src/multigrid.c
@@ -1,6 +1,10 @@
 #include "multigrid.h"
 #include <math.h>
 
+#ifndef M_PI
+#define M_PI 3.141592653589793
+#endif
+
 struct mg_lvl {
   uint npres, nposts;
   scalar over;
@@ -8,13 +12,10 @@ struct mg_lvl {
 
   struct gs_data *Q; // gs handle for matrix vector product
   struct par_mat *M; // Operator
-
-  struct gs_data *Qs, *Qst; // gs handle for matrix vector product
-  struct par_mat *S, *St;   // Smooth aggregation
 };
 
 struct mg {
-  uint sagg, nlevels, *level_off;
+  uint nlevels, *level_off;
   struct mg_lvl **levels;
   scalar *buf;
 };
@@ -29,7 +30,7 @@ static scalar sigma_cheb(int k, int n, scalar lmin, scalar lmax) {
   return 1 / lamk;
 }
 
-static void inline set_proc(struct mij *m, uint nelt, uint nrem, uint np) {
+inline static void set_proc(struct mij *m, uint nelt, uint nrem, uint np) {
   assert(m->r > 0);
 
   if (nrem == 0) {
@@ -43,129 +44,99 @@ static void inline set_proc(struct mij *m, uint nelt, uint nrem, uint np) {
       m->p = s + (m->r - (t + 1)) / (nelt + 1);
   }
 
-  assert(m->p >= 0 && m->p < np);
+  assert(m->p < np);
 }
 
-extern int sparse_gemm(struct par_mat *WG, const struct par_mat *W,
+static int sparse_gemm(struct par_mat *WG, const struct par_mat *W,
                        const struct par_mat *G, int diag_wg, struct crystal *cr,
-                       buffer *bfr);
+                       buffer *bfr) {
+  // W is in CSR, G is in CSC; we multiply rows of W by shifting
+  // the columns of G from processor to processor. This is not scalable
+  // at all -- need to do a 2D partition of the matrices W and G.
+  assert(IS_CSR(W) && !IS_DIAG(W));
+  assert(IS_CSC(G));
+
+  // Put G into an array to transfer from processor to processor
+  struct array gij, sij;
+  array_init(struct mij, &gij, 100);
+  array_init(struct mij, &sij, 100);
+
+  struct mij m = {.r = 0, .c = 0, .idx = 0, .p = cr->comm.id, .v = 0};
+  uint i, j, je;
+  for (i = 0; i < G->cn; i++) {
+    m.c = G->cols[i];
+    for (j = G->adj_off[i], je = G->adj_off[i + 1]; j != je; j++) {
+      m.r = G->rows[G->adj_idx[j]];
+      m.v = G->adj_val[j];
+      array_cat(struct mij, &gij, &m, 1);
+    }
+  }
+  if (IS_DIAG(G)) {
+    for (i = 0; i < G->cn; i++) {
+      m.c = m.r = G->cols[i];
+      m.v = G->diag_val[i];
+      array_cat(struct mij, &gij, &m, 1);
+    }
+  }
 
-static uint mg_setup_aux(struct mg *d, const int factor, const int sagg,
-                         struct crystal *cr, struct array *mijs, buffer *bfr) {
-  uint lvl = d->nlevels;
-  struct mg_lvl *l = d->levels[lvl - 1];
+  sarray_sort_2(struct mij, gij.ptr, gij.n, c, 1, r, 1, bfr);
+  struct mij *pg = (struct mij *)gij.ptr;
+  for (i = 0; i < gij.n; i++)
+    pg[i].idx = i;
+
+  for (uint p = 0; p < cr->comm.np; p++) {
+    // Calculate dot product of each row of W with columns of G
+    for (i = 0; i < W->rn; i++) {
+      m.r = W->rows[i];
+      uint s = 0, e = 0;
+      while (s < gij.n) {
+        m.c = pg[s].c, m.v = 0;
+        for (j = W->adj_off[i], je = W->adj_off[i + 1]; j < je; j++) {
+          ulong k = W->cols[W->adj_idx[j]];
+          while (e < gij.n && pg[s].c == pg[e].c && pg[e].r < k)
+            e++;
+          if (e < gij.n && pg[s].c == pg[e].c && pg[e].r == k)
+            m.v += W->adj_val[j] * pg[e].v;
+        }
+        while (e < gij.n && pg[s].c == pg[e].c)
+          e++;
+        if (fabs(m.v) > 1e-12)
+          array_cat(struct mij, &sij, &m, 1);
+        s = e;
+      }
+    }
 
-  struct par_mat *Ml = l->M;
-  uint nnz = ((Ml->rn > 0) ? (Ml->adj_off[Ml->rn] + Ml->rn) : 0);
+    sint next = (cr->comm.id + 1) % cr->comm.np;
+    for (i = 0; i < gij.n; i++)
+      pg[i].p = next;
+    sarray_transfer(struct mij, &gij, p, 0, cr);
 
-  struct mij m = {.r = 0, .c = 0, .idx = 0, .p = 0, .v = 0};
-  array_reserve(struct mij, mijs, nnz);
+    sarray_sort(struct mij, gij.ptr, gij.n, idx, 0, bfr);
+    pg = gij.ptr;
+  }
 
-  struct comm *c = &cr->comm;
-  const double sigma = 0.65;
-  struct par_mat *M;
-  // Replace M by the following if smooth aggregation is used:
-  // S = (I - sigma * D^{-1} * Ml)
-  // M = ST * Ml * S
-  if (sagg) {
-    // This is very hacky and not optimal at all. Should be rewritten.
-    // Create S is in CSR format, with separate diagonal. Then convert
-    // to CSC with no separate diagonal in order to do the mat-vec.
-    mijs->n = 0;
-    for (uint i = 0; i < Ml->rn; i++) {
-      m.c = m.r = Ml->rows[i], m.v = 1 - sigma;
-      array_cat(struct mij, mijs, &m, 1);
-      double di = 1.0 / Ml->diag_val[i];
-      for (uint j = Ml->adj_off[i], je = Ml->adj_off[i + 1]; j < je; j++) {
-        m.c = Ml->cols[Ml->adj_idx[j]];
-        m.v = -sigma * di * Ml->adj_val[j];
-        array_cat(struct mij, mijs, &m, 1);
-      }
-    }
-    l->S = tcalloc(struct par_mat, 1);
-    par_mat_setup(l->S, mijs, 1, 1, bfr);
-    l->Qs = setup_Q(l->S, c, bfr);
+  par_csr_setup(WG, &sij, diag_wg, bfr);
+  array_free(&gij), array_free(&sij);
 
-    struct par_mat S;
-    par_csr_to_csc(&S, l->S, 0, cr, bfr);
+  return 0;
+}
 
-    // Create N = M in CSR format, no separate diagonal.
-    mijs->n = 0;
-    for (uint i = 0; i < Ml->rn; i++) {
-      m.c = m.r = Ml->rows[i], m.v = Ml->diag_val[i];
-      array_cat(struct mij, mijs, &m, 1);
-      for (uint j = Ml->adj_off[i], je = Ml->adj_off[i + 1]; j < je; j++) {
-        m.c = Ml->cols[Ml->adj_idx[j]];
-        m.v = Ml->adj_val[j];
-        array_cat(struct mij, mijs, &m, 1);
-      }
-    }
-    struct par_mat N;
-    par_mat_setup(&N, mijs, 1, 0, bfr);
-
-    // T = N * S, CSR format, no separate diagonal.
-    struct par_mat T;
-    sparse_gemm(&T, &N, &S, 0, cr, bfr);
-    par_mat_free(&N), par_mat_free(&S);
-
-    // N = T, CSC format, no separate diagonal.
-    par_csr_to_csc(&N, &T, 0, cr, bfr);
-    par_mat_free(&T);
-
-    // Setup S^t, CSR format, no separate diagonal.
-    mijs->n = 0;
-    for (uint i = 0; i < Ml->rn; i++) {
-      m.c = m.r = Ml->rows[i], m.v = 1 - sigma;
-      array_cat(struct mij, mijs, &m, 1);
-      double di = 1.0 / Ml->diag_val[i];
-      for (uint j = Ml->adj_off[i], je = Ml->adj_off[i + 1]; j < je; j++) {
-        m.r = Ml->cols[Ml->adj_idx[j]];
-        m.v = -sigma * di * Ml->adj_val[j];
-        array_cat(struct mij, mijs, &m, 1);
-      }
-    }
-    par_mat_setup(&T, mijs, 0, 0, bfr);
-    par_csc_to_csr(&S, &T, 0, cr, bfr);
-    par_mat_free(&T);
-
-    // M = ST * N
-    M = tcalloc(struct par_mat, 1);
-    sparse_gemm(M, &S, &N, 1, cr, bfr);
-    par_mat_free(&S), par_mat_free(&N);
-
-    // Normalize M by the largest value
-    double max = 0;
-    for (uint i = 0; i < M->rn; i++) {
-      for (uint j = M->adj_off[i], je = M->adj_off[i + 1]; j < je; j++)
-        if (fabs(M->adj_val[j]) > max)
-          max = fabs(M->adj_val[j]);
-      if (fabs(M->diag_val[i]) > max)
-        max = fabs(M->diag_val[i]);
-    }
-    double wrk[2];
-    comm_allreduce(c, gs_double, gs_max, &max, 1, wrk);
+static uint mg_setup_aux(struct mg *d, const int factor, struct crystal *cr,
+                         struct array *mijs, buffer *bfr) {
+  uint lvl = d->nlevels;
+  struct mg_lvl *l = d->levels[lvl - 1];
 
-    for (uint i = 0; i < M->rn; i++) {
-      for (uint j = M->adj_off[i], je = M->adj_off[i + 1]; j < je; j++)
-        M->adj_val[j] /= max;
-      M->diag_val[i] /= max;
-    }
+  struct par_mat *M = l->M;
+  uint nnz = ((M->rn > 0) ? (M->adj_off[M->rn] + M->rn) : 0);
 
-    par_mat_setup(&T, mijs, 0, 0, bfr);
-    l->St = tcalloc(struct par_mat, 1);
-    par_csc_to_csr(l->St, &T, 1, cr, bfr);
-    par_mat_free(&T);
-    l->Qst = setup_Q(l->St, c, bfr);
-  } else {
-    l->S = l->St = NULL;
-    l->Qs = l->Qst = NULL;
-    M = Ml;
-  }
+  struct mij m = {.r = 0, .c = 0, .idx = 0, .p = 0, .v = 0};
+  array_reserve(struct mij, mijs, nnz);
 
   // Now we interpolate to find the coarse operator Mc = J^T M J
   // Calculate coarse level parameters: ngc, npc, nelt, nrem
   uint size = (M->rn > 0 ? (M->rows[M->rn - 1] - M->rows[0] + 1) : 0);
   slong ng = size, wrk[2][1];
+  struct comm *c = &cr->comm;
   comm_allreduce(c, gs_long, gs_add, &ng, 1, wrk);
 
   // ng > 1 based on while condition in mg_setup(). so ngc >= 1
@@ -197,11 +168,6 @@ static uint mg_setup_aux(struct mg *d, const int factor, const int sagg,
     array_cat(struct mij, mijs, &m, 1);
   }
 
-  if (sagg) {
-    par_mat_free(M);
-    free(M);
-  }
-
   sarray_transfer(struct mij, mijs, p, 0, cr);
   sarray_sort_2(struct mij, mijs->ptr, mijs->n, r, 1, c, 1, bfr);
 
@@ -221,14 +187,13 @@ static uint mg_setup_aux(struct mg *d, const int factor, const int sagg,
   return lvl;
 }
 
-struct mg *mg_setup(const struct par_mat *M, const int factor, const int sagg,
+struct mg *mg_setup(const struct par_mat *M, const int factor,
                     struct crystal *cr, buffer *bfr) {
   assert(IS_CSR(M));
   assert(M->rn == 0 || IS_DIAG(M));
 
   // Allocate memory for struct mg
   struct mg *d = (struct mg *)tcalloc(struct mg, 1);
-  d->sagg = sagg;
 
   // Setup Level 1, keeps a pointer to input matrix
   d->nlevels = 1;
@@ -254,7 +219,7 @@ struct mg *mg_setup(const struct par_mat *M, const int factor, const int sagg,
   slong wrk[2], ng = size;
   comm_allreduce(c, gs_long, gs_add, &ng, 1, wrk);
   while (ng > 1) {
-    uint l = mg_setup_aux(d, factor, sagg, cr, &mijs, bfr);
+    uint l = mg_setup_aux(d, factor, cr, &mijs, bfr);
     struct par_mat *Ml = d->levels[l]->M;
     if (Ml->rn > 0 && Ml->adj_off[Ml->rn] + Ml->rn > nnz)
       nnz = Ml->adj_off[Ml->rn] + Ml->rn;
@@ -296,7 +261,7 @@ void mg_vcycle(scalar *u1, scalar *rhs, struct mg *d, struct comm *c,
   scalar *s = r + nnz, *Gs = s + nnz, *u = Gs + nnz, *wrk = u + nnz;
 
   uint i, j, n, off;
-  for (int lvl = 0; lvl < d->nlevels - 1; lvl++) {
+  for (uint lvl = 0; lvl < d->nlevels - 1; lvl++) {
     off = lvl_off[lvl];
     n = lvl_off[lvl + 1] - off;
 
@@ -331,10 +296,6 @@ void mg_vcycle(scalar *u1, scalar *rhs, struct mg *d, struct comm *c,
         r[off + j] = r[off + j] - Gs[off + j];
     }
 
-    // Apply S^T
-    if (d->sagg)
-      mat_vec_csr(r + off, r + off, l->St, l->Qst, wrk, bfr);
-
     // Interpolate to coarser level
     gs(r + off, gs_double, gs_add, 1, l->J, bfr);
   }
@@ -342,7 +303,6 @@ void mg_vcycle(scalar *u1, scalar *rhs, struct mg *d, struct comm *c,
   // Coarsest level
   off = lvl_off[d->nlevels - 1];
   n = lvl_off[d->nlevels] - off;
-
   if (n == 1) {
     struct mg_lvl *l = d->levels[d->nlevels - 1];
     struct par_mat *M = l->M;
@@ -353,16 +313,12 @@ void mg_vcycle(scalar *u1, scalar *rhs, struct mg *d, struct comm *c,
     r[off] = u[off];
   }
 
-  for (int lvl = d->nlevels - 2; lvl >= 0; lvl--) {
+  for (int lvl = (int)d->nlevels - 2; lvl >= 0; lvl--) {
     struct mg_lvl *l = d->levels[lvl];
     off = lvl_off[lvl];
     // J*e
     gs(r + off, gs_double, gs_add, 0, l->J, bfr);
 
-    // Apply S
-    if (d->sagg)
-      mat_vec_csr(r + off, r + off, l->S, l->Qs, wrk, bfr);
-
     // u = u + over*S*J*e
     n = lvl_off[lvl + 1] - off;
     for (j = 0; j < n; j++)
@@ -384,14 +340,6 @@ void mg_free(struct mg *d) {
         gs_free(l[i]->J), l[i]->J = NULL;
       if (l[i]->Q != NULL)
         gs_free(l[i]->Q), l[i]->Q = NULL;
-      if (l[i]->Qs != NULL)
-        gs_free(l[i]->Qs), l[i]->Qs = NULL;
-      if (l[i]->Qst != NULL)
-        gs_free(l[i]->Qst), l[i]->Qst = NULL;
-      if (l[i]->S != NULL)
-        par_mat_free(l[i]->S), l[i]->S = NULL;
-      if (l[i]->St != NULL)
-        par_mat_free(l[i]->St), l[i]->St = NULL;
       if (l[i] != NULL)
         free(l[i]), l[i] = NULL;
     }
diff --git a/src/multigrid.h b/src/multigrid.h
index 07f4e61e..72c9f961 100644
--- a/src/multigrid.h
+++ b/src/multigrid.h
@@ -4,7 +4,7 @@
 #include "mat.h"
 
 struct mg;
-struct mg *mg_setup(const struct par_mat *M, const int factor, const int sagg,
+struct mg *mg_setup(const struct par_mat *M, const int factor,
                     struct crystal *cr, buffer *bfr);
 void mg_vcycle(scalar *u, scalar *rhs, struct mg *d, struct comm *c,
                buffer *bfr);
diff --git a/src/parRSB.h b/src/parRSB.h
index ec3fa119..7a71ca6f 100644
--- a/src/parRSB.h
+++ b/src/parRSB.h
@@ -20,36 +20,44 @@ extern "C" {
 //
 typedef struct {
   // General options
-  int partitioner;   // Partition algo: 0 - RSB, 1 - RCB, 2 - RIB (Default: 0)
-  int verbose_level; // Verbose level: 0, 1, 2, .. etc (Default: 1)
-  int profile_level; // Profile level: 0, 1, 2, .. etc (Default: 1)
-  int two_level;     // Enable two level partitioning (Default: 0)
+  int partitioner; // Partition algo: 0 - RSB, 1 - RCB, 2 - RIB (Default: 0)
+  int tagged;      // Tagged partitioning: 0 - No, 1 - Yes (Default: 0)
+  int levels;      // Number of levels (levels: 1, 2)
   int repair; // Repair disconnected components: 0 - No, 1 - Yes (Default: 0)
-  // RSB common (Lanczos + MG) options
+  int verbose_level; // Verbose level: 0, 1, 2, .. etc (Default: 1)
+  int profile_level; // Profile level: 0, 1, 2, .. etc (Default: 0)
+  // RSB common (Lanczos and MG) options
   int rsb_algo; // RSB algo: 0 - Lanczos, 1 - MG (Default: 0)
   int rsb_pre;  // RSB pre-partition : 0 - None, 1 - RCB , 2 - RIB (Default: 1)
   int rsb_max_iter;   // Max iterations in Lanczos / MG (Default: 50)
   int rsb_max_passes; // Max Lanczos restarts / Inverse iterations (Default: 50)
   double rsb_tol;     // Tolerance for Lanczos or RQI (Default: 1e-5)
+  int rsb_dump_stats; // Dump partition statistics to a text file.
   // RSB MG specific options
   int rsb_mg_grammian; // MG Grammian: 0 or 1 (Default: 0)
   int rsb_mg_factor;   // MG Coarsening factor (Default: 2, should be > 1)
-  int rsb_mg_sagg;     // MG smooth aggregation: 0 or 1 (Default: 0)
 } parrsb_options;
 
 extern parrsb_options parrsb_default_options;
 
-int parrsb_part_mesh(int *part, int *seq, long long *vtx, double *coord,
-                     int nel, int nv, parrsb_options options, MPI_Comm comm);
+int parrsb_part_mesh(int *part, const long long *const vtx,
+                     const double *const xyz, const int *const tag,
+                     const int nel, const int nv, parrsb_options *const options,
+                     MPI_Comm comm);
 
-#define fparrsb_part_mesh FORTRAN_UNPREFIXED(fparrsb_partmesh, FPARRSB_PARTMESH)
-void fparrsb_part_mesh(int *part, int *seq, long long *vtx, double *coord,
-                       int *nel, int *nve, int *options, int *comm, int *err);
+void parrsb_part_solid(int *part, const long long *vtx2, unsigned nel2,
+                       const long long *vtx1, unsigned nel1, unsigned nv,
+                       MPI_Comm comm);
 
+void parrsb_check_tagged_partitions(const long long *const eids,
+                                    const long long *const vtx, const uint nel,
+                                    const unsigned nv, const uint ntags,
+                                    const struct comm *const c,
+                                    const int verbose);
 //==============================================================================
 // Connectivity
 //
-int parrsb_conn_mesh(long long *vtx, double *coord, int nel, int nDim,
+int parrsb_conn_mesh(long long *vtx, double *coord, uint nel, unsigned nDim,
                      long long *periodicInfo, int nPeriodicFaces, double tol,
                      MPI_Comm comm);
 
@@ -72,9 +80,6 @@ int parrsb_dump_con(char *name, unsigned nelt, unsigned nv, long long *vl,
 int parrsb_dump_map(char *name, unsigned nelt, unsigned nv, long long *vl,
                     MPI_Comm comm);
 
-int parrsb_dump_part(char *name, unsigned nelt, unsigned nv, double *coord,
-                     int gid, MPI_Comm comm);
-
 //==============================================================================
 // Auxiliary functions
 //
@@ -85,13 +90,6 @@ typedef struct {
   int dump;    // dump the connectivity or map file, default: 1
   int nactive; // # of active MPI ranks, default: INT_MAX
   int verbose; // Verbosity, default: 0
-
-  int ilu_type;   // ILU type, default: 0
-  double ilu_tol; // ILU tolerance, default: 0.1
-  int ilu_pivot;  // Pivoting for ILU: default: 0
-
-  int crs_type;   // Coarse solver type, default: 0
-  double crs_tol; // Coarse tolerance, default: 1e-3
 } parrsb_cmd_line_opts;
 
 parrsb_cmd_line_opts *parrsb_parse_cmd_opts(int argc, char *argv[]);
@@ -99,7 +97,7 @@ parrsb_cmd_line_opts *parrsb_parse_cmd_opts(int argc, char *argv[]);
 void parrsb_cmd_opts_free(parrsb_cmd_line_opts *opts);
 
 int parrsb_dist_mesh(unsigned *nelt, long long **vl, double **coord, int *part,
-                     int nv, MPI_Comm comm);
+                     unsigned nv, MPI_Comm comm);
 
 int parrsb_setup_mesh(unsigned *nelt, unsigned *nv, long long **vl,
                       double **coord, parrsb_cmd_line_opts *opts,
diff --git a/src/parrsb-impl.h b/src/parrsb-impl.h
index e34319e2..f792dbb1 100644
--- a/src/parrsb-impl.h
+++ b/src/parrsb-impl.h
@@ -1,15 +1,12 @@
 #ifndef _PARRSB_IMPL_H_
 #define _PARRSB_IMPL_H_
 
-#include "parRSB.h"
-#include <assert.h>
+#define _POSIX_C_SOURCE 200809L
+
 #include <float.h>
-#include <limits.h>
-#include <math.h>
-#include <stddef.h>
-#include <stdio.h>
 #include <stdlib.h>
-#include <time.h>
+
+#include "parRSB.h"
 
 #ifdef scalar
 #undef scalar
@@ -22,15 +19,14 @@
 #define SCALAR_MAX DBL_MAX
 #define SCALAR_TOL 1e-12
 
-#define MAXDIM 3 // Maximum dimension of the mesh
-#define MAXNV 8  // Maximum number of vertices per element
+#define MAXDIM 3 // Maximum dimension of the mesh.
+#define MAXNV 8  // Maximum number of vertices per element.
 
 //------------------------------------------------------------------------------
-// RCB / RIB
+// RCB / RIB.
 // `struct rcb_element` is used for RCB and RIB partitioning.
-// `struct rsb_element` should be a superset of `struct rcb_element`
 struct rcb_element {
-  uint proc, origin, seq;
+  uint proc, origin;
   ulong globalId;
   scalar coord[MAXDIM], fiedler;
 };
@@ -41,17 +37,20 @@ int rib(struct array *elements, size_t unit_size, int ndim, struct comm *c,
         buffer *bfr);
 
 //------------------------------------------------------------------------------
-// RSB
-//
+// RSB.
+// `struct rsb_element` = `struct rcb_element` + vertices. Order is important.
 struct rsb_element {
-  uint proc, origin, seq;
+  uint proc, origin;
   ulong globalId;
   scalar coord[MAXDIM], fiedler;
   slong vertices[MAXNV];
 };
 
+void rsb(struct array *elements, int nv, const parrsb_options *const options,
+         const struct comm comms[3], buffer *bfr);
+
 //------------------------------------------------------------------------------
-// Find number of components
+// Find number of components.
 //
 uint get_components(sint *component, struct array *elems, unsigned nv,
                     struct comm *c, buffer *buf, int verbose);
@@ -59,7 +58,21 @@ uint get_components_v2(sint *component, struct array *elems, unsigned nv,
                        const struct comm *ci, buffer *bfr, int verbose);
 
 //------------------------------------------------------------------------------
-// Laplacian
+// Dump partition statistics.
+//
+void parrsb_dump_stats_start(const uint nv_);
+
+void parrsb_dump_stats(const struct comm *const gc, const struct comm *const lc,
+                       const struct array *const elems, buffer *bfr);
+
+void parrsb_dump_stats_end(const struct comm *const gc, const char *prefix);
+
+uint parrsb_get_neighbors(const struct array *const elems, const unsigned nv,
+                          const struct comm *const gc,
+                          const struct comm *const lc, buffer *bfr);
+
+//------------------------------------------------------------------------------
+// Laplacian.
 //
 #define GS 1
 #define CSR 2
@@ -72,11 +85,11 @@ int laplacian(scalar *v, struct laplacian *l, scalar *u, buffer *buf);
 void laplacian_free(struct laplacian *l);
 
 //------------------------------------------------------------------------------
-// Misc
+// Misc.
 //
 int log2ll(long long n);
 
 void parrsb_barrier(struct comm *c);
 
-void debug_print(struct comm *c, int verbose, const char *fmt, ...);
+void parrsb_print(const struct comm *c, int verbose, const char *fmt, ...);
 #endif
diff --git a/src/parrsb.c b/src/parrsb.c
new file mode 100644
index 00000000..d68dec00
--- /dev/null
+++ b/src/parrsb.c
@@ -0,0 +1,972 @@
+#include "metrics.h"
+#include "parrsb-impl.h"
+
+#include <ctype.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#define MIN(a, b) ((a) < (b) ? (a) : (b)) // NB: evaluates one argument twice.
+
+// Barrier on `c`, then rank 0 prints the formatted message if verbose > 0.
+void parrsb_print(const struct comm *c, int verbose, const char *fmt, ...) {
+  comm_barrier(c);
+  if (c->id != 0 || verbose <= 0)
+    return;
+
+  va_list args;
+  va_start(args, fmt);
+  vprintf(fmt, args);
+  va_end(args);
+  printf("\n"), fflush(stdout);
+}
+
+parrsb_options parrsb_default_options = {
+    // General options (the values are documented in parRSB.h)
+    .partitioner = 0, // 0 - RSB
+    .tagged = 0,      // 0 - no tagged partitioning
+    .levels = 2,
+    .repair = 0,      // 0 - do not repair disconnected components
+    .verbose_level = 1,
+    .profile_level = 0,
+    // RSB common (Lanczos and MG) options
+    .rsb_algo = 0, // 0 - Lanczos
+    .rsb_pre = 1,  // 1 - RCB pre-partition
+    .rsb_max_iter = 50,
+    .rsb_max_passes = 50,
+    .rsb_tol = 1e-5,
+    .rsb_dump_stats = 0,
+    // RSB MG specific options
+    .rsb_mg_grammian = 0,
+    .rsb_mg_factor = 2};
+
+static char *ALGO[3] = {"RSB", "RCB", "RIB"}; // Order of `partitioner` 0, 1, 2.
+
+// Overwrite each option whose PARRSB_* environment variable is set.
+static void update_options(parrsb_options *const options) {
+#define UPDATE_OPTION(OPT, STR, IS_INT)                                        \
+  do {                                                                         \
+    const char *const env = getenv(STR);                                       \
+    if (env == NULL)                                                           \
+      break;                                                                   \
+    if (IS_INT)                                                                \
+      options->OPT = atoi(env);                                                \
+    else                                                                       \
+      options->OPT = atof(env);                                                \
+  } while (0)
+
+  UPDATE_OPTION(partitioner, "PARRSB_PARTITIONER", 1);
+  UPDATE_OPTION(tagged, "PARRSB_TAGGED", 1);
+  UPDATE_OPTION(levels, "PARRSB_LEVELS", 1);
+  UPDATE_OPTION(repair, "PARRSB_REPAIR", 1);
+  UPDATE_OPTION(verbose_level, "PARRSB_VERBOSE_LEVEL", 1);
+  UPDATE_OPTION(profile_level, "PARRSB_PROFILE_LEVEL", 1);
+  UPDATE_OPTION(rsb_algo, "PARRSB_RSB_ALGO", 1);
+  UPDATE_OPTION(rsb_pre, "PARRSB_RSB_PRE", 1);
+  UPDATE_OPTION(rsb_max_iter, "PARRSB_RSB_MAX_ITER", 1);
+  UPDATE_OPTION(rsb_max_passes, "PARRSB_RSB_MAX_PASSES", 1);
+  UPDATE_OPTION(rsb_tol, "PARRSB_RSB_TOL", 0);
+  UPDATE_OPTION(rsb_dump_stats, "PARRSB_DUMP_STATS", 1);
+  UPDATE_OPTION(rsb_mg_grammian, "PARRSB_RSB_MG_GRAMMIAN", 1);
+  UPDATE_OPTION(rsb_mg_factor, "PARRSB_RSB_MG_FACTOR", 1);
+#undef UPDATE_OPTION
+}
+
+// Print every option as "NAME = value" through parrsb_print().
+static void print_options(const struct comm *c,
+                          const parrsb_options *const options) {
+  const int verbose = options->verbose_level;
+#define PRINT_OPTION(OPT, STR, FMT)                                            \
+  parrsb_print(c, verbose, "%s = " FMT "", STR, options->OPT)
+  PRINT_OPTION(partitioner, "PARRSB_PARTITIONER", "%d");
+  PRINT_OPTION(tagged, "PARRSB_TAGGED", "%d");
+  PRINT_OPTION(levels, "PARRSB_LEVELS", "%d");
+  PRINT_OPTION(repair, "PARRSB_REPAIR", "%d");
+  PRINT_OPTION(verbose_level, "PARRSB_VERBOSE_LEVEL", "%d");
+  PRINT_OPTION(profile_level, "PARRSB_PROFILE_LEVEL", "%d");
+  PRINT_OPTION(rsb_algo, "PARRSB_RSB_ALGO", "%d");
+  PRINT_OPTION(rsb_pre, "PARRSB_RSB_PRE", "%d");
+  PRINT_OPTION(rsb_max_iter, "PARRSB_RSB_MAX_ITER", "%d");
+  PRINT_OPTION(rsb_max_passes, "PARRSB_RSB_MAX_PASSES", "%d");
+  PRINT_OPTION(rsb_tol, "PARRSB_RSB_TOL", "%lf");
+  PRINT_OPTION(rsb_dump_stats, "PARRSB_DUMP_STATS", "%d");
+  PRINT_OPTION(rsb_mg_grammian, "PARRSB_RSB_MG_GRAMMIAN", "%d");
+  PRINT_OPTION(rsb_mg_factor, "PARRSB_RSB_MG_FACTOR", "%d");
+#undef PRINT_OPTION
+}
+
+// Spread the `nel` local elements evenly over the ranks of `cr->comm`, tagging
+// each with a global id. Returns the size in bytes of one entry of `elist`.
+static size_t load_balance(struct array *elist, uint nel, int nv,
+                           const double *const xyz, const long long *const vtx,
+                           int verbose, struct crystal *cr, buffer *bfr) {
+  struct comm *c = &cr->comm;
+  slong out[2][1], wrk[2][1], in = nel;
+  comm_scan(out, c, gs_long, gs_add, &in, 1, wrk);
+  slong start = out[0][0], nelg = out[1][0];
+  // slong is a gslib typedef: cast so that the arguments always match %lld.
+  parrsb_print(c, verbose, "load_balance: start = %lld nelg = %lld",
+               (long long)start, (long long)nelg);
+
+  uint nstar = nelg / c->np, nrem = nelg - nstar * c->np;
+  slong lower = (nstar + 1) * nrem;
+
+  const size_t unit_size =
+      (vtx == NULL) ? sizeof(struct rcb_element) : sizeof(struct rsb_element);
+  parrsb_print(
+      c, verbose, "load_balance: unit_size = %zu (rsb = %zu, rcb = %zu)",
+      unit_size, sizeof(struct rsb_element), sizeof(struct rcb_element));
+
+  array_init_(elist, nel, unit_size, __FILE__, __LINE__);
+
+  // gslib's tcalloc() reports allocation failures instead of returning NULL.
+  struct rcb_element *pe = (struct rcb_element *)tcalloc(char, unit_size);
+  pe->origin = c->id;
+
+  int ndim = (nv == 8) ? 3 : 2;
+  for (uint e = 0; e < nel; ++e) {
+    slong eg = pe->globalId = start + e + 1;
+    if (nstar == 0)
+      pe->proc = eg - 1;
+    else if (eg <= lower)
+      pe->proc = (eg - 1) / (nstar + 1);
+    else
+      pe->proc = (eg - 1 - lower) / nstar + nrem;
+
+    pe->coord[0] = pe->coord[1] = pe->coord[2] = 0.0;
+    if (xyz != NULL) {
+      for (int v = 0; v < nv; v++)
+        for (int n = 0; n < ndim; n++)
+          pe->coord[n] += xyz[e * ndim * nv + v * ndim + n];
+      for (int n = 0; n < ndim; n++)
+        pe->coord[n] /= nv;
+    }
+
+    array_cat_(unit_size, elist, pe, 1, __FILE__, __LINE__);
+  }
+  free(pe);
+
+  if (vtx != NULL) { // RSB
+    struct rsb_element *pr = (struct rsb_element *)elist->ptr;
+    for (uint e = 0; e < nel; e++) {
+      for (int v = 0; v < nv; v++)
+        pr[e].vertices[v] = vtx[e * nv + v];
+    }
+  }
+
+  sarray_transfer_(elist, unit_size, offsetof(struct rcb_element, proc), 1, cr);
+  if (vtx == NULL) // RCB
+    sarray_sort(struct rcb_element, elist->ptr, elist->n, globalId, 1, bfr);
+  else             // RSB
+    sarray_sort(struct rsb_element, elist->ptr, elist->n, globalId, 1, bfr);
+
+  return unit_size;
+}
+
+// Return the elements to their `origin` ranks, sort them by global id and copy
+// the `origin` field of each element into `part`.
+static void restore_original(int *part, struct crystal *cr, struct array *elist,
+                             size_t usize, buffer *bfr) {
+  sarray_transfer_(elist, usize, offsetof(struct rcb_element, origin), 1, cr);
+
+  const uint count = elist->n;
+  if (usize == sizeof(struct rsb_element))      // RSB
+    sarray_sort(struct rsb_element, elist->ptr, count, globalId, 1, bfr);
+  else if (usize == sizeof(struct rcb_element)) // RCB
+    sarray_sort(struct rcb_element, elist->ptr, count, globalId, 1, bfr);
+
+  // Both element types start with the `struct rcb_element` fields.
+  const char *cursor = (const char *)elist->ptr;
+  for (uint e = 0; e < count; e++, cursor += usize)
+    part[e] = ((const struct rcb_element *)cursor)->origin;
+}
+
+// Set `c` to the sub-communicator of `gc` made of the ranks sharing a node.
+static void initialize_node_aux(struct comm *c, const struct comm *const gc) {
+#ifdef MPI
+  MPI_Comm node;
+  MPI_Comm_split_type(gc->c, MPI_COMM_TYPE_SHARED, gc->id, MPI_INFO_NULL,
+                      &node);
+  comm_init(c, node), MPI_Comm_free(&node);
+#else // Serial build: there is a single rank, so `c` is a copy of `gc`.
+  comm_dup(c, gc);
+#endif
+}
+
+// Create one communicator per partitioning level in `comms`: the global one
+// first and, with several levels, the node one last. Updates `*levels_`.
+static void initialize_levels(struct comm *const comms, int *const levels_,
+                              const struct comm *const c, const int verbose) {
+  // Level 1 communicator is the global communicator.
+  comm_dup(&comms[0], c);
+  // Node level communicator is the last level communicator.
+  struct comm nc;
+  initialize_node_aux(&nc, c);
+
+  // Find the number of nodes under the global communicator and number of MPI
+  // ranks in the node level communicator.
+  uint nnodes, nranks_per_node;
+  {
+    sint in = (nc.id == 0), wrk;
+    comm_allreduce(c, gs_int, gs_add, &in, 1, &wrk);
+    nnodes = in;
+
+    nranks_per_node = nc.np;
+    // Check invariant: nranks_per_node should be the same across all the nodes.
+    sint nranks_max = nranks_per_node, nranks_min = nranks_per_node;
+    comm_allreduce(&comms[0], gs_int, gs_max, &nranks_max, 1, &wrk);
+    comm_allreduce(&comms[0], gs_int, gs_min, &nranks_min, 1, &wrk);
+    assert(nranks_max == nranks_min);
+    // Check invariant: nranks_per_node must be larger than 0.
+    assert(nranks_per_node > 0);
+    parrsb_print(c, verbose,
+                 "initialize_levels: num_nodes = %u, num_ranks_per_node = %u",
+                 nnodes, nranks_per_node);
+  }
+
+  // Partition sizes (in nodes) per level. The size at a given level must be a
+  // multiple of the size at the next level.
+  sint levels;
+  uint sizes[2] = {nnodes, 1};
+  {
+    const uint size_max = sizeof(sizes) / sizeof(sizes[0]);
+    uint start = 1;
+    while (start < size_max && sizes[start] >= sizes[0])
+      start++;
+    while (start < size_max && sizes[0] % sizes[start])
+      ++start;
+
+    uint level = 1;
+    for (; start < size_max; ++start, ++level)
+      sizes[level] = sizes[start];
+    // The last level is the node level partitioner, so its size is 1.
+    sizes[level - 1] = 1;
+
+    // Check assert: sizes should be strictly decreasing.
+    for (uint i = 1; i < level; i++)
+      assert(sizes[i - 1] > sizes[i]);
+
+    levels = level;
+  }
+
+  for (sint level = 1; level < levels - 1; ++level) {
+    comm_split(&comms[level - 1],
+               comms[level - 1].id / (sizes[level] * nranks_per_node),
+               comms[level - 1].id, &comms[level]);
+  }
+  levels = MIN(levels, *levels_);
+  if (levels > 1)
+    comm_dup(&comms[levels - 1], &nc);
+  *levels_ = levels;
+  parrsb_print(c, verbose, "initialize_levels: levels = %d", levels);
+
+  comm_free(&nc);
+}
+
+// Partition without tags: balance, run the chosen partitioner, fill `part`.
+static void parrsb_part_mesh_v0(int *part, const long long *const vtx,
+                                const double *const xyz, const uint nel,
+                                const unsigned nv,
+                                parrsb_options *const options,
+                                const struct comm *const c,
+                                struct crystal *const cr, buffer *const bfr) {
+  const int verbose = options->verbose_level;
+
+  if (vtx == NULL && xyz == NULL) {
+    parrsb_print(
+        c, verbose,
+        "parrsb_part_mesh_v0: Both vertices and coordinates can't be NULL");
+    MPI_Abort(c->c, EXIT_FAILURE);
+  }
+  if (xyz == NULL)
+    options->rsb_pre = 0;
+
+  struct array elist;
+  size_t esize = load_balance(&elist, nel, nv, xyz, vtx, verbose, cr, bfr);
+
+  struct comm ca;
+  comm_split(c, elist.n > 0, c->id, &ca);
+
+  // Setup communicators for each level of the partitioning.
+  struct comm comms[9];
+  {
+    // Check invariant: 0 < levels <= number of entries in comms.
+    const uint levels = options->levels;
+    assert(levels > 0 && levels <= sizeof(comms) / sizeof(comms[0]));
+    initialize_levels(comms, &options->levels, &ca, verbose);
+    parrsb_print(c, verbose,
+                 "parrsb_part_mesh_v0: Levels:  requested = %u, enabled = %d",
+                 levels, options->levels);
+  }
+
+  parrsb_print(c, verbose, "parrsb_part_mesh_v0: running partitioner ...");
+  if (elist.n > 0) {
+    int ndim = (nv == 8) ? 3 : 2;
+    switch (options->partitioner) {
+    case 0:
+      rsb(&elist, nv, options, comms, bfr);
+      break;
+    case 1:
+      rcb(&elist, esize, ndim, &ca, bfr);
+      break;
+    case 2:
+      rib(&elist, esize, ndim, &ca, bfr);
+      break;
+    default:
+      break;
+    }
+  }
+  comm_free(&ca);
+
+  for (uint l = 0; l < (uint)options->levels; l++)
+    comm_free(&comms[l]);
+
+  parrsb_print(c, verbose, "parrsb_part_mesh_v0: restore original input");
+  restore_original(part, cr, &elist, esize, bfr);
+
+  array_free(&elist);
+}
+
+// Collective check of a tagged partition: exits if `eids` is unsorted on any
+// rank and asserts that vertex multiplicities agree across the layers.
+void parrsb_check_tagged_partitions(const long long *const eids,
+                                    const long long *const vtx, const uint nel,
+                                    const unsigned nv, const uint ntags,
+                                    const struct comm *const c,
+                                    const int verbose) {
+  parrsb_print(c, verbose, "Check if the input elements are sorted locally.");
+  {
+    sint sorted = 1;
+    for (uint i = 1; i < nel; i++) {
+      if (eids[i] < eids[i - 1]) {
+        sorted = 0;
+        break;
+      }
+    }
+    sint wrk;
+    comm_allreduce(c, gs_int, gs_min, &sorted, 1, &wrk);
+    if (!sorted) {
+      if (c->id == 0) {
+        fprintf(stderr, "Input elements are not sorted.\n");
+        fflush(stderr);
+      }
+      exit(EXIT_FAILURE);
+    }
+  }
+
+  // Number the elements within each tag id and setup a gs handle based on the
+  // 2D element id.
+  parrsb_print(c, verbose, "Number elements within each layer.");
+  assert(ntags > 0); // `ntags` is used as a divisor below.
+  const uint tag_id = c->id / ntags;
+  struct comm lc;
+  struct gs_data *gse = NULL;
+  {
+    comm_split(c, tag_id, c->id, &lc);
+    slong out[2][1], wrk[2][1], in = nel;
+    comm_scan(out, &lc, gs_long, gs_add, &in, 1, wrk);
+    slong start = out[0][0];
+
+    // Ids start at 1: gs_setup() ignores the entries whose id is 0.
+    slong *lids = tcalloc(slong, nel);
+    for (uint i = 0; i < nel; i++)
+      lids[i] = start + i + 1;
+
+    gse = gs_setup(lids, nel, c, 0, gs_pairwise, 0);
+    free(lids);
+  }
+
+  // Setup a local gs handle based on the original gs vertex ids.
+  parrsb_print(c, verbose, "Setup multiplicity.");
+  const size_t size = nel * nv;
+  buffer bfr;
+  buffer_init(&bfr, size);
+  sint *mul = tcalloc(sint, size);
+  {
+    struct gs_data *gsl = gs_setup(vtx, size, &lc, 0, gs_pairwise, 0);
+    for (uint i = 0; i < size; i++)
+      mul[i] = 1;
+    gs(mul, gs_int, gs_add, 0, gsl, &bfr);
+    gs_free(gsl);
+  }
+
+  // Now let's compare the multiplicity across the layers.
+  parrsb_print(c, verbose, "Check multiplicity across the layers.");
+  {
+    sint *lmin = tcalloc(sint, nel);
+    sint *lmax = tcalloc(sint, nel);
+    for (uint v = 0; v < nv; v++) {
+      for (uint e = 0; e < nel; e++) {
+        lmin[e] = mul[e * nv + v];
+        lmax[e] = mul[e * nv + v];
+      }
+
+      gs(lmin, gs_int, gs_min, 0, gse, &bfr);
+      gs(lmax, gs_int, gs_max, 0, gse, &bfr);
+
+      for (uint e = 0; e < nel; e++)
+        assert(lmin[e] == lmax[e]);
+    }
+
+    free(lmin), free(lmax);
+  }
+
+  free(mul);
+  buffer_free(&bfr);
+  gs_free(gse);
+  comm_free(&lc);
+}
+
+// Tagged partitioning: elements that share a tag are gathered onto a dedicated
+// chunk of np / num_tags consecutive ranks, partitioned there with
+// parrsb_part_mesh_v0, and the resulting rank ids are written to part[0..nel).
+static void parrsb_part_mesh_v1(int *part, const long long *const vtx,
+                                const double *const xyz, const int *const tag,
+                                const uint nel, const unsigned nv,
+                                parrsb_options *const options,
+                                const struct comm *const c,
+                                struct crystal *const cr, buffer *const bfr) {
+  const int verbose = options->verbose_level;
+  parrsb_print(c, verbose, "Find number of tags in the mesh ...");
+
+  struct tag_t {
+    uint p, tag, seq, tagn;
+  };
+  struct array tags;
+  array_init(struct tag_t, &tags, nel);
+  // Record (seq, tag) per element; p = tag % np is the tag's rendezvous rank.
+  {
+    struct tag_t tt;
+    for (uint i = 0; i < nel; i++) {
+      tt.seq = i, tt.tag = tag[i], tt.p = tt.tag % c->np;
+      array_cat(struct tag_t, &tags, &tt, 1);
+    }
+    sarray_sort(struct tag_t, tags.ptr, tags.n, tag, 0, bfr);
+  }
+  // Keep one entry per distinct tag on this rank (tags is sorted by tag).
+  struct array unique;
+  array_init(struct tag_t, &unique, 1024);
+
+  if (tags.n > 0) {
+    const struct tag_t *const pt = (const struct tag_t *const)tags.ptr;
+    array_cat(struct tag_t, &unique, &pt[0], 1);
+    for (uint i = 1; i < tags.n; i++) {
+      if (pt[i].tag > pt[i - 1].tag)
+        array_cat(struct tag_t, &unique, &pt[i], 1);
+    }
+  }
+  // Gather distinct tags on their rendezvous ranks and count them globally.
+  sint out[2][1];
+  {
+    sarray_transfer(struct tag_t, &unique, p, 1, cr);
+    sarray_sort(struct tag_t, unique.ptr, unique.n, tag, 0, bfr);
+
+    const struct tag_t *const pu = (const struct tag_t *const)unique.ptr;
+    sint in = 0;
+    if (unique.n > 0) {
+      in = 1;
+      for (uint i = 1; i < unique.n; i++) {
+        if (pu[i].tag > pu[i - 1].tag)
+          in++;
+      }
+    }
+
+    sint wrk[2][1];
+    comm_scan(out, c, gs_int, gs_add, &in, 1, wrk);
+  }
+  const uint num_tags = out[1][0], tag_start = out[0][0];
+
+  parrsb_print(c, verbose, "Num tags: %d", num_tags);
+  if (c->np % num_tags != 0) {
+    if (c->id == 0) {
+      fprintf(stderr,
+              "Number of processes must be a multiple of number of tags: "
+              "processes = %d, tags = %d.\n",
+              c->np, num_tags);
+    }
+    exit(EXIT_FAILURE);
+  }
+  // Number the distinct tags consecutively (tagn), then transfer on p again.
+  {
+    struct tag_t *const pu = (struct tag_t *const)unique.ptr;
+    uint start = tag_start;
+    if (unique.n > 0) {
+      pu[0].tagn = start;
+      for (uint i = 1; i < unique.n; i++) {
+        if (pu[i].tag > pu[i - 1].tag)
+          start++;
+        pu[i].tagn = start;
+      }
+    }
+
+    sarray_transfer(struct tag_t, &unique, p, 0, cr);
+    sarray_sort(struct tag_t, unique.ptr, unique.n, tag, 0, bfr);
+  }
+  // Spread each tag's elements round-robin (by seq) over its chunk of ranks.
+  const uint chunk_size = c->np / num_tags;
+  parrsb_print(c, verbose, "Processes per tag: %d", chunk_size);
+  {
+    struct tag_t *const pt = (struct tag_t *const)tags.ptr;
+    const struct tag_t *const pu = (const struct tag_t *const)unique.ptr;
+    for (uint i = 0, s = 0; i < unique.n; i++) {
+      uint e = s + 1;
+      assert(pt[s].tag == pu[i].tag);
+      while (e < tags.n && pt[e].tag == pu[i].tag)
+        e++;
+      for (uint j = s; j < e; j++)
+        pt[j].p = chunk_size * pu[i].tagn + pt[j].seq % chunk_size;
+      s = e;
+    }
+
+    sarray_sort(struct tag_t, tags.ptr, tags.n, seq, 0, bfr);
+  }
+  array_free(&unique);
+
+  struct element_t {
+    uint proc, part, seq;
+    scalar xyz[MAXDIM * MAXNV];
+    slong vertices[MAXNV];
+  };
+  struct array elements;
+  array_init(struct element_t, &elements, nel);
+
+  parrsb_print(c, verbose,
+               "Pack element data for transfering. tags.n=%u, nel=%u", tags.n,
+               nel);
+  const unsigned ndim = (nv == 8) ? 3 : 2;
+  {
+    assert(tags.n == nel);
+    const struct tag_t *const pt = (const struct tag_t *const)tags.ptr;
+    struct element_t et;
+    for (uint i = 0; i < tags.n; i++) {
+      et.proc = pt[i].p, et.seq = i;
+      for (uint j = 0; j < nv; j++) {
+        et.vertices[j] = vtx[i * nv + j];
+        for (uint k = 0; k < ndim; k++)
+          et.xyz[j * ndim + k] = xyz[i * nv * ndim + j * ndim + k];
+      }
+      array_cat(struct element_t, &elements, &et, 1);
+    }
+
+    sarray_transfer(struct element_t, &elements, proc, 1, cr);
+  }
+  array_free(&tags);
+
+  parrsb_print(c, verbose, "Copy element data for feeding to parRSB.");
+  long long *lvtx = tcalloc(long long, (elements.n + 1) * nv);
+  double *lxyz = tcalloc(double, (elements.n + 1) * nv * ndim);
+  {
+    const struct element_t *const pe =
+        (const struct element_t *const)elements.ptr;
+    for (uint e = 0; e < elements.n; e++) {
+      for (uint j = 0; j < nv; j++) {
+        lvtx[e * nv + j] = pe[e].vertices[j];
+        for (uint k = 0; k < ndim; k++)
+          lxyz[e * nv * ndim + j * ndim + k] = pe[e].xyz[j * ndim + k];
+      }
+    }
+  }
+
+  parrsb_print(c, verbose, "Run parRSB locally within a tag now.");
+  {
+    int *lpart = tcalloc(int, elements.n + 1);
+    struct comm lc;
+    comm_split(c, c->id / chunk_size, c->id, &lc);
+
+    struct crystal lcr;
+    crystal_init(&lcr, &lc);
+    // Silence the nested run. NOTE: options is modified and not restored here.
+    options->verbose_level = 0;
+    options->profile_level = 0;
+    parrsb_part_mesh_v0(lpart, lvtx, lxyz, elements.n, nv, options, &lc, &lcr,
+                        bfr);
+    crystal_free(&lcr), comm_free(&lc);
+
+    struct element_t *const pe = (struct element_t *const)elements.ptr;
+    for (uint e = 0; e < elements.n; e++) {
+      pe[e].part = lpart[e] + (c->id / chunk_size) * chunk_size;
+      assert(pe[e].part < c->np);
+    }
+    free(lpart);
+
+    sarray_transfer(struct element_t, &elements, proc, 0, cr);
+    assert(nel == elements.n);
+  }
+  free(lvtx), free(lxyz);
+  // Back in input order (seq), copy the partition ids to the output array.
+  {
+    sarray_sort(struct element_t, elements.ptr, elements.n, seq, 0, bfr);
+    const struct element_t *const pe =
+        (const struct element_t *const)elements.ptr;
+    for (uint i = 0; i < nel; i++)
+      part[i] = pe[i].part;
+  }
+
+  array_free(&elements);
+}
+
+// Choose a target partition for one element from the ids on its nv vertices:
+// the most frequent non-negative frontier[] value; ties keep the first sorted.
+static void update_frontier(sint *const target, sint *const hop,
+                            sint *const frontier, const unsigned nv,
+                            const unsigned hid, const struct comm *c,
+                            buffer *const bfr) {
+  // If target is already set, we don't update either target or hop.
+  // We simply update frontier to previous target value and return.
+  if (*target >= 0) {
+    // Check invariant: *hop < INT_MAX
+    assert(*hop < INT_MAX);
+    for (uint i = 0; i < nv; i++)
+      frontier[i] = *target;
+    return;
+  }
+  // Collect the non-negative ids currently sitting on this element's vertices.
+  struct dest_t {
+    uint target;
+  };
+  struct array dests;
+  array_init(struct dest_t, &dests, nv);
+  {
+    struct dest_t dt;
+    for (uint i = 0; i < nv; i++) {
+      if (frontier[i] >= 0) {
+        dt.target = frontier[i];
+        array_cat(struct dest_t, &dests, &dt, 1);
+      }
+    }
+  }
+  // Majority vote over the collected candidates (run-length count after sort).
+  if (dests.n > 0) {
+    sarray_sort(struct dest_t, dests.ptr, dests.n, target, 0, bfr);
+    const struct dest_t *const pd = (const struct dest_t *const)dests.ptr;
+    uint current_target = pd[0].target, current_count = 1;
+    uint final_target = current_target, final_count = 1;
+    for (uint i = 1; i < dests.n; i++) {
+      if (pd[i].target == current_target) {
+        current_count++;
+      } else {
+        if (current_count > final_count)
+          final_count = current_count, final_target = current_target;
+        current_target = pd[i].target, current_count = 1;
+      }
+    }
+    if (current_count > final_count)
+      final_target = current_target;
+
+    // Publish the winner on every vertex and record the hop count (hid + 1).
+    for (uint j = 0; j < nv; j++)
+      frontier[j] = final_target;
+    *target = final_target, *hop = hid + 1;
+  }
+
+  array_free(&dests);
+}
+
+// Greedy partitioner for the solid mesh (vtx2, nel2) given a fluid mesh (vtx1,
+// nel1) whose elements are treated as already owned by their current rank.
+// A BFS over shared vertices proposes a rank for each unassigned element;
+// ranks keep the closest (fewest hops) up to their share; result goes in part.
+void parrsb_part_solid(int *part, const long long *const vtx2,
+                       const unsigned nel2, const long long *const vtx1,
+                       const unsigned nel1, const unsigned nv,
+                       const MPI_Comm comm) {
+  struct comm c;
+  comm_init(&c, comm);
+  parrsb_print(&c, 1, "Running greedy solid ... nel1 = %u nel2 = %u", nel1,
+               nel2);
+
+  for (uint i = 0; i < nel2; i++)
+    part[i] = -1;
+
+  buffer bfr;
+  buffer_init(&bfr, 1024);
+  struct crystal cr;
+  crystal_init(&cr, &c);
+
+  // Return if global size is 0.
+  const uint nelt = nel1 + nel2;
+  slong nelg = nelt;
+  {
+    slong wrk;
+    comm_allreduce(&c, gs_long, gs_add, &nelg, 1, &wrk);
+    if (nelg == 0) {
+      parrsb_print(&c, 1, "Mesh is empty ...");
+      crystal_free(&cr);
+      buffer_free(&bfr);
+      comm_free(&c);
+      return;
+    }
+  }
+
+  const size_t size1 = nel1 * nv;
+  const size_t size2 = nel2 * nv;
+  const size_t size = size1 + size2;
+
+  // Setup the gather-scatter handle to find connectivity through BFS.
+  parrsb_print(&c, 1, "Setup gather-scatter handle ...");
+  struct gs_data *gsh = NULL;
+  {
+    slong *vtx = tcalloc(slong, size);
+    for (size_t i = 0; i < size1; i++)
+      vtx[i] = vtx1[i];
+    for (size_t i = 0; i < size2; i++)
+      vtx[size1 + i] = vtx2[i];
+
+    gsh = gs_setup(vtx, size, &c, 0, gs_pairwise, 0);
+    free(vtx);
+  }
+
+  // Check if the solid + fluid mesh is connected. Otherwise, we cannot use
+  // the greedy solid partitioner.
+  parrsb_print(&c, 1, "Check if fluid + solid is connected ...");
+  {
+    slong wrk;
+    sint idmin = (c.id + 1) * (size > 0);
+    comm_allreduce(&c, gs_int, gs_min, &idmin, 1, &wrk);
+    assert(idmin > 0);
+    // NOTE(review): a rank with size == 0 contributes 0 and trips this assert.
+    sint *const component = tcalloc(sint, size);
+    if (c.id + 1 == (uint)idmin) {
+      for (uint i = 0; i < nv; i++)
+        component[i] = 1;
+    }
+
+    slong marked0 = 0, marked1 = 1;
+    sint epoch = 0;
+    while (marked1 > marked0) {
+      gs(component, gs_int, gs_max, 0, gsh, &bfr);
+      marked0 = marked1, marked1 = 0;
+      for (uint i = 0; i < nel1 + nel2; i++) {
+        sint v = 0;
+        for (uint j = 0; j < nv; j++)
+          v += component[i * nv + j];
+        if (v > 0) {
+          for (uint j = 0; j < nv; j++)
+            component[i * nv + j] = 1;
+          marked1 += 1;
+        }
+      }
+      comm_allreduce(&c, gs_long, gs_add, &marked1, 1, &wrk);
+      parrsb_print(&c, 1, "\tepoch = %d marked0 = %lld marked1 = %lld", epoch,
+                   marked0, marked1);
+      epoch++;
+    }
+    free(component);
+
+    if (marked1 != nelg) {
+      if (c.id == 0) {
+        fprintf(stderr, "Fluid + Solid mesh is not connected.\n");
+        fflush(stderr);
+      }
+      exit(EXIT_FAILURE);
+    }
+  }
+
+  // Calculate the global number of elements in solid mesh and expected number
+  // of elements in each partition.
+  parrsb_print(&c, 1, "Calculate expected number of elements ...");
+  slong nelgt2 = nel2;
+  uint nexp2;
+  {
+    slong wrk;
+    comm_allreduce(&c, gs_long, gs_add, &nelgt2, 1, &wrk);
+    nexp2 = nelgt2 / c.np;
+    nexp2 += (c.id < (nelgt2 - nexp2 * c.np));
+    // Check for invariant: (min(nexp2) -  max(nexp2)) <= 1.
+    slong nexp2_min = nexp2, nexp2_max = nexp2;
+    comm_allreduce(&c, gs_long, gs_min, &nexp2_min, 1, &wrk);
+    comm_allreduce(&c, gs_long, gs_max, &nexp2_max, 1, &wrk);
+    assert(nexp2_max - nexp2_min <= 1);
+    // Check for invariant: (sum(nexp2) == nelgt2).
+    slong nexp2_sum = nexp2;
+    comm_allreduce(&c, gs_long, gs_add, &nexp2_sum, 1, &wrk);
+    assert(nexp2_sum == nelgt2);
+  }
+
+  // Initialize array of elements to be sent to each partition.
+  struct elem_t {
+    sint part;
+    uint target, hop, sequence;
+  };
+
+  struct array arr;
+  array_init(struct elem_t, &arr, nel2);
+
+  // Allocate space for work arrays: frontier, target, and hop.
+  sint *const frontier = tcalloc(sint, size);
+  sint *const target = tcalloc(sint, nelt);
+  sint *const hop = tcalloc(sint, nelt);
+
+  uint nrecv2 = 0;
+  slong nrem2 = nelgt2;
+  while (nrem2 > 0) {
+    parrsb_print(&c, 1, "nrem2 = %lld", nrem2);
+
+    // Check for invariant: nrecv2 <= nexp2.
+    assert(nrecv2 <= nexp2);
+
+    // If the partition does not have enough elements, we keep it under
+    // consideration for accepting new solid elements. If the partition
+    // already has enough elements, we take that partition out of
+    // consideration (by setting the frontier to -1). We always initialize solid
+    // elements as unassigned (-1) although they may be already assigned. We
+    // check for that later when we actually assign the elements to partitions.
+    {
+      sint id = c.id, hid = 0;
+      if (nrecv2 == nexp2)
+        id = -1, hid = INT_MAX;
+
+      // Max id should be >= 0;
+      sint wrk, idmax = id;
+      comm_allreduce(&c, gs_int, gs_max, &idmax, 1, &wrk);
+      assert(idmax >= 0);
+
+      // Initialize frontier, target, and hop.
+      for (uint i = 0; i < size1; i++)
+        frontier[i] = id;
+      for (uint i = size1; i < size; i++)
+        frontier[i] = -1;
+      for (uint i = 0; i < nel1; i++)
+        target[i] = id, hop[i] = hid;
+      for (uint i = nel1; i < nelt; i++)
+        target[i] = -1, hop[i] = INT_MAX;
+    }
+
+    // Then perform a BFS till we assign all the elements in the solid mesh with
+    // a potential partition id.
+    parrsb_print(&c, 1, "Assign partition id ...");
+    {
+      sint assigned = 0;
+      slong wrk;
+      for (uint hid = 0; !assigned; hid++) {
+        gs(frontier, gs_int, gs_max, 0, gsh, &bfr);
+        assigned = 1;
+        slong unassigned = 0;
+        for (uint i = 0; i < nelt; i++) {
+          update_frontier(&target[i], &hop[i], &frontier[i * nv], nv, hid, &c,
+                          &bfr);
+          assigned = assigned && (target[i] >= 0);
+          unassigned += (target[i] < 0);
+        }
+
+        comm_allreduce(&c, gs_int, gs_min, &assigned, 1, &wrk);
+        comm_allreduce(&c, gs_long, gs_add, &unassigned, 1, &wrk);
+        parrsb_print(&c, 1, "hid = %u, assigned = %d unassigned = %lld", hid,
+                     assigned, unassigned);
+      }
+    }
+
+    // Pack unassigned solid elements and send them to the target partition.
+    arr.n = 0;
+    {
+      struct elem_t et = {.part = -1};
+      for (uint i = 0; i < nel2; i++) {
+        if (part[i] >= 0)
+          continue;
+        et.sequence = i, et.target = target[nel1 + i], et.hop = hop[nel1 + i];
+        array_cat(struct elem_t, &arr, &et, 1);
+      }
+
+      parrsb_print(&c, 1, "Send elemenets to the target partition ...");
+      sarray_transfer(struct elem_t, &arr, target, 1, &cr);
+    }
+
+    // Assign elements if the partition still doesn't have enough elements.
+    if (nrecv2 < nexp2) {
+      // We sort by hop value. Elements with lower hop value are assigned first
+      // since they are technically closer to the partition.
+      sarray_sort(struct elem_t, arr.ptr, arr.n, hop, 1, &bfr);
+      struct elem_t *const pa = (struct elem_t *const)arr.ptr;
+      uint keep = MIN(nexp2 - nrecv2, arr.n);
+      for (uint i = 0; i < keep; i++)
+        pa[i].part = c.id;
+      nrecv2 += keep;
+      // Check for invariant: nrecv2 <= nexp2.
+      assert(nrecv2 <= nexp2);
+    }
+
+    // Send everything back with updated partition id and update the part array.
+    {
+      parrsb_print(&c, 1, "Send everything back ...");
+      sarray_transfer(struct elem_t, &arr, target, 0, &cr);
+
+      const struct elem_t *const pa = (const struct elem_t *const)arr.ptr;
+      for (uint j = 0; j < arr.n; j++)
+        part[pa[j].sequence] = pa[j].part;
+      arr.n = 0;
+    }
+
+    // Update the number of elements remaining.
+    {
+      slong wrk;
+      nrem2 = nexp2 - nrecv2;
+      comm_allreduce(&c, gs_long, gs_add, &nrem2, 1, &wrk);
+    }
+  }
+
+  gs_free(gsh);
+  free(frontier), free(target), free(hop);
+  array_free(&arr);
+  crystal_free(&cr);
+  buffer_free(&bfr);
+  comm_free(&c);
+}
+
+// Entry point: partition the local mesh (nel elements, nv vertices each) over
+// comm and store each element's partition id in part. Returns 0 on success and
+// 1 if tagged partitioning is requested (options->tagged == 1) without tags.
+int parrsb_part_mesh(int *part, const long long *const vtx,
+                     const double *const xyz, const int *const tag,
+                     const int nel, const int nv, parrsb_options *const options,
+                     MPI_Comm comm) {
+  struct comm c;
+  comm_init(&c, comm);
+  update_options(options);
+
+  // Sum the element count over all ranks and print the banner.
+  const int verbose = options->verbose_level;
+  {
+    slong nelg = nel, wrk;
+    comm_allreduce(&c, gs_long, gs_add, &nelg, 1, &wrk);
+    parrsb_print(&c, verbose, "Running parRSB ..., nv = %d, nelg = %lld", nv,
+                 nelg);
+  }
+
+  print_options(&c, options);
+
+  if (options->tagged == 1 && !tag) {
+    parrsb_print(&c, verbose,
+                 "Tagged partitioning requested but tag array is NULL..");
+    comm_free(&c); // Do not leak the communicator on the early exit.
+    return 1;
+  }
+
+  buffer bfr;
+  buffer_init(&bfr, (nel + 1) * 72);
+  struct crystal cr;
+  crystal_init(&cr, &c);
+  metric_init();
+
+  parrsb_barrier(&c);
+  const double t = comm_time();
+
+  if (options->tagged == 1)
+    parrsb_part_mesh_v1(part, vtx, xyz, tag, nel, nv, options, &c, &cr, &bfr);
+  if (options->tagged == 0)
+    parrsb_part_mesh_v0(part, vtx, xyz, nel, nv, options, &c, &cr, &bfr);
+
+  parrsb_print(&c, verbose, "par%s finished in %g seconds.",
+               ALGO[options->partitioner], comm_time() - t);
+  metric_rsb_print(&c, options->profile_level);
+  metric_finalize();
+
+  crystal_free(&cr);
+  buffer_free(&bfr);
+  comm_free(&c);
+
+  return 0;
+}
+
+#undef MIN
diff --git a/src/rcb.c b/src/rcb.c
index 5defa8c5..a695ab5f 100644
--- a/src/rcb.c
+++ b/src/rcb.c
@@ -1,8 +1,11 @@
 #include "parrsb-impl.h"
 #include "sort.h"
 
+#include <float.h>
+#include <string.h>
+
 static void get_axis_len(double *length, size_t unit_size, char *elems,
-                         uint nel, int ndim, struct comm *c) {
+                         uint nel, uint ndim, struct comm *c) {
   double min[3] = {DBL_MAX, DBL_MAX, DBL_MAX},
          max[3] = {-DBL_MAX, -DBL_MAX, -DBL_MAX};
 
@@ -140,23 +143,17 @@ static int rcb_level(struct array *a, size_t unit_size, int ndim,
 
 int rcb(struct array *elements, size_t unit_size, int ndim, struct comm *ci,
         buffer *bfr) {
-  struct comm c, t;
+  struct comm c;
   comm_dup(&c, ci);
 
-  int size = c.np;
-  int rank = c.id;
-
+  uint size = c.np, rank = c.id;
   while (size > 1) {
     rcb_level(elements, unit_size, ndim, &c, bfr);
 
-    int bin = 1;
-    if (rank < (size + 1) / 2)
-      bin = 0;
-
+    struct comm t;
+    const int bin = ((rank >= (size + 1) / 2) ? 1 : 0);
     comm_split(&c, bin, rank, &t);
-    comm_free(&c);
-    comm_dup(&c, &t);
-    comm_free(&t);
+    comm_free(&c), comm_dup(&c, &t), comm_free(&t);
 
     size = c.np, rank = c.id;
   }
diff --git a/src/rib.c b/src/rib.c
index 1aac9031..b64d8771 100644
--- a/src/rib.c
+++ b/src/rib.c
@@ -96,23 +96,16 @@ int rib(struct array *elements, size_t unit_size, int ndim, struct comm *ci,
   struct comm c;
   comm_dup(&c, ci);
 
-  int size = c.np;
-  int rank = c.id;
-
+  uint size = c.np, rank = c.id;
   while (size > 1) {
     rib_level(elements, unit_size, ndim, &c, bfr);
 
-    int p = (size + 1) / 2;
-    int bin = (rank >= p);
-
-    MPI_Comm comm_rib;
-    MPI_Comm_split(c.c, bin, rank, &comm_rib);
-    comm_free(&c);
-    comm_init(&c, comm_rib);
-    MPI_Comm_free(&comm_rib);
+    struct comm t;
+    const int bin = ((rank >= (size + 1) / 2) ? 1 : 0);
+    comm_split(&c, bin, rank, &t);
+    comm_free(&c), comm_dup(&c, &t), comm_free(&t);
 
-    size = c.np;
-    rank = c.id;
+    size = c.np, rank = c.id;
   }
   comm_free(&c);
 
diff --git a/src/rsb-aux.c b/src/rsb-aux.c
deleted file mode 100644
index 04e500c4..00000000
--- a/src/rsb-aux.c
+++ /dev/null
@@ -1,396 +0,0 @@
-#include "metrics.h"
-#include "parrsb-impl.h"
-#include "sort.h"
-
-static unsigned disconnected = 0;
-
-extern int fiedler(struct array *elements, int nv, parrsb_options *options,
-                   struct comm *gsc, buffer *buf, int verbose);
-
-static void test_component_versions(struct array *elements, struct comm *lc,
-                                    unsigned nv, unsigned lvl, buffer *bfr) {
-  // Send elements to % P processor to test disconnected components
-  struct crystal cr;
-  crystal_init(&cr, lc);
-
-  struct rsb_element *pe = (struct rsb_element *)elements->ptr;
-  for (unsigned e = 0; e < elements->n; e++)
-    pe[e].proc = pe[e].globalId % lc->np;
-
-  sarray_transfer(struct rsb_element, elements, proc, 1, &cr);
-
-  MPI_Comm tmp;
-  int color = (lc->id < lc->np / 2);
-  MPI_Comm_split(lc->c, color, lc->id, &tmp);
-
-  struct comm tc0;
-  comm_init(&tc0, tmp);
-
-  sint nc1 = get_components(NULL, elements, nv, &tc0, bfr, 0);
-  sint nc2 = get_components_v2(NULL, elements, nv, &tc0, bfr, 0);
-  if (nc1 != nc2) {
-    if (tc0.id == 0)
-      printf("lvl = %u SS BFS != MS BFS: %d %d\n", lvl, nc1, nc2);
-    fflush(stdout);
-  }
-  if (nc1 > 1) {
-    if (tc0.id == 0)
-      printf("lvl = %u: %d disconnected componets were present.\n", lvl, nc1);
-    fflush(stdout);
-  }
-
-  comm_free(&tc0);
-  MPI_Comm_free(&tmp);
-
-  sarray_transfer(struct rsb_element, elements, proc, 0, &cr);
-  crystal_free(&cr);
-}
-
-static void check_rsb_partition(struct comm *gc, parrsb_options *opts) {
-  int max_levels = log2ll(gc->np);
-  int miter = opts->rsb_max_iter, mpass = opts->rsb_max_passes;
-
-  for (int i = 0; i < max_levels; i++) {
-    sint converged = 1;
-    int val = (int)metric_get_value(i, RSB_FIEDLER_CALC_NITER);
-    if (opts->rsb_algo == 0) {
-      if (val == miter * mpass)
-        converged = 0;
-    } else if (opts->rsb_algo == 1) {
-      if (val == mpass)
-        converged = 0;
-    }
-
-    struct comm c;
-    comm_split(gc, converged, gc->id, &c);
-
-    slong bfr[4];
-    if (converged == 0) {
-      if (opts->rsb_algo == 0) {
-        double init = metric_get_value(i, TOL_INIT);
-        comm_allreduce(&c, gs_double, gs_min, &init, 1, (void *)bfr);
-
-        double target = metric_get_value(i, TOL_TGT);
-        comm_allreduce(&c, gs_double, gs_min, &target, 1, (void *)bfr);
-
-        double final = metric_get_value(i, TOL_FNL);
-        comm_allreduce(&c, gs_double, gs_min, &final, 1, (void *)bfr);
-        if (c.id == 0) {
-          printf("Warning: Lanczos reached a residual of %lf (target: %lf) "
-                 "after %d x %d iterations in Level=%d!\n",
-                 final, target, mpass, miter, i);
-          fflush(stdout);
-        }
-      } else if (opts->rsb_algo == 1) {
-        if (c.id == 0) {
-          printf("Warning: Inverse iteration didn't converge after %d "
-                 "iterations in Level = %d\n",
-                 mpass, i);
-          fflush(stdout);
-        }
-      }
-    }
-    comm_free(&c);
-
-    sint minc, maxc;
-    minc = maxc = (sint)metric_get_value(i, RSB_COMPONENTS);
-    comm_allreduce(gc, gs_int, gs_min, &minc, 1, (void *)bfr);
-    comm_allreduce(gc, gs_int, gs_max, &maxc, 1, (void *)bfr);
-
-    if (maxc > 1 && gc->id == 0) {
-      printf("Warning: Partition created %d/%d (min/max) disconnected "
-             "components in Level=%d!\n",
-             minc, maxc, i);
-      fflush(stdout);
-    }
-  }
-}
-
-static int check_bin_val(int bin, struct comm *c) {
-  if (bin < 0 || bin > 1) {
-    if (c->id == 0) {
-      printf("%s:%d bin value out of range: %d\n", __FILE__, __LINE__, bin);
-      fflush(stdout);
-    }
-    return 1;
-  }
-  return 0;
-}
-
-int balance_partitions(struct array *elements, int nv, struct comm *lc,
-                       struct comm *gc, int bin, buffer *bfr) {
-  assert(check_bin_val(bin, gc) == 0);
-
-  struct ielem_t {
-    uint index, orig;
-    sint dest;
-    scalar fiedler;
-  };
-
-  // Calculate expected # of elements per processor
-  uint ne = elements->n;
-  slong nelgt = ne, nglob = ne, wrk;
-  comm_allreduce(lc, gs_long, gs_add, &nelgt, 1, &wrk);
-  comm_allreduce(gc, gs_long, gs_add, &nglob, 1, &wrk);
-
-  sint ne_ = nglob / gc->np, nrem = nglob - ne_ * gc->np;
-  slong nelgt_exp = ne_ * lc->np + nrem / 2 + (nrem % 2) * (1 - bin);
-  slong send_cnt = nelgt - nelgt_exp > 0 ? nelgt - nelgt_exp : 0;
-
-  // Setup gather-scatter
-  uint size = ne * nv, e, v;
-  slong *ids = tcalloc(slong, size);
-  struct rsb_element *elems = (struct rsb_element *)elements->ptr;
-  for (e = 0; e < ne; e++) {
-    for (v = 0; v < nv; v++)
-      ids[e * nv + v] = elems[e].vertices[v];
-  }
-  struct gs_data *gsh = gs_setup(ids, size, gc, 0, gs_pairwise, 0);
-
-  sint *input = (sint *)ids;
-  if (send_cnt > 0)
-    for (e = 0; e < size; e++)
-      input[e] = 0;
-  else
-    for (e = 0; e < size; e++)
-      input[e] = 1;
-
-  gs(input, gs_int, gs_add, 0, gsh, bfr);
-
-  for (e = 0; e < ne; e++)
-    elems[e].proc = gc->id;
-
-  sint sid = (send_cnt == 0) ? gc->id : INT_MAX, balanced = 0;
-  comm_allreduce(gc, gs_int, gs_min, &sid, 1, &wrk);
-
-  struct crystal cr;
-
-  if (send_cnt > 0) {
-    struct array ielems;
-    array_init(struct ielem_t, &ielems, 10);
-
-    struct ielem_t ielem = {
-        .index = 0, .orig = lc->id, .dest = -1, .fiedler = 0};
-    int mul = (sid == 0) ? 1 : -1;
-    for (e = 0; e < ne; e++) {
-      for (v = 0; v < nv; v++) {
-        if (input[e * nv + v] > 0) {
-          ielem.index = e, ielem.fiedler = mul * elems[e].fiedler;
-          array_cat(struct ielem_t, &ielems, &ielem, 1);
-          break;
-        }
-      }
-    }
-
-    // Sort based on fiedler value and sets `orig` field
-    parallel_sort(struct ielem_t, &ielems, fiedler, gs_double, 0, 1, lc, bfr);
-
-    slong out[2][1], bfr[2][1], nielems = ielems.n;
-    comm_scan(out, lc, gs_long, gs_add, &nielems, 1, bfr);
-    slong start = out[0][0];
-
-    sint P = gc->np - lc->np;
-    sint part_size = (send_cnt + P - 1) / P;
-
-    if (out[1][0] >= send_cnt) {
-      balanced = 1;
-      struct ielem_t *ptr = ielems.ptr;
-      for (e = 0; start + e < send_cnt && e < ielems.n; e++)
-        ptr[e].dest = sid + (start + e) / part_size;
-
-      crystal_init(&cr, lc);
-      sarray_transfer(struct ielem_t, &ielems, orig, 0, &cr);
-      crystal_free(&cr);
-
-      ptr = ielems.ptr;
-      for (e = 0; e < ielems.n; e++)
-        if (ptr[e].dest != -1)
-          elems[ptr[e].index].proc = ptr[e].dest;
-    }
-
-    array_free(&ielems);
-  }
-
-  comm_allreduce(gc, gs_int, gs_max, &balanced, 1, &wrk);
-  if (balanced == 1) {
-    crystal_init(&cr, gc);
-    sarray_transfer(struct rsb_element, elements, proc, 0, &cr);
-    crystal_free(&cr);
-
-    // Do a load balanced sort in each partition
-    parallel_sort(struct rsb_element, elements, fiedler, gs_double, 0, 1, lc,
-                  bfr);
-  } else {
-    // Forget about disconnected components, just do a load balanced partition
-    // TODO: Need to change how parallel_sort load balance
-    parallel_sort(struct rsb_element, elements, fiedler, gs_double, 0, 1, gc,
-                  bfr);
-  }
-
-  free(ids), gs_free(gsh);
-  return 0;
-}
-
-int repair_partitions_v2(struct array *elems, unsigned nv, struct comm *tc,
-                         struct comm *lc, unsigned bin, unsigned algo,
-                         buffer *bfr) {
-  assert(check_bin_val(bin, lc) == 0);
-
-  sint ibuf;
-  sint nc = get_components_v2(NULL, elems, nv, tc, bfr, 0);
-  comm_allreduce(lc, gs_int, gs_max, &nc, 1, &ibuf);
-  if (nc > 1) {
-    // If nc > 1, send elements back and do RCBx, RCBy and RCBz
-    struct crystal cr;
-    crystal_init(&cr, lc);
-    sarray_transfer(struct rsb_element, elems, proc, 0, &cr);
-    crystal_free(&cr);
-
-    // Do rcb or rib
-    unsigned ndim = (nv == 8) ? 3 : 2;
-    switch (algo) {
-    case 0:
-      parallel_sort(struct rsb_element, elems, globalId, gs_long, 0, 1, lc,
-                    bfr);
-      break;
-    case 1:
-      rcb(elems, sizeof(struct rsb_element), ndim, lc, bfr);
-      break;
-    case 2:
-      rib(elems, sizeof(struct rsb_element), ndim, lc, bfr);
-      break;
-    default:
-      break;
-    }
-
-    // And count number of components again. If nc > 1 still, set
-    // isconnected = 1
-    nc = get_components_v2(NULL, elems, nv, tc, bfr, 0);
-    comm_allreduce(lc, gs_int, gs_max, &nc, 1, &ibuf);
-    if (nc > 1)
-      disconnected = 1;
-  }
-
-  return 0;
-}
-
-static void get_part(sint *np, sint *nid, int two_lvl, struct comm *lc,
-                     struct comm *nc) {
-  if (two_lvl) {
-    sint out[2][1], wrk[2][1], in = (nc->id == 0);
-    comm_scan(out, lc, gs_int, gs_add, &in, 1, &wrk);
-    *nid = (nc->id == 0) * out[0][0], *np = out[1][0];
-    comm_allreduce(nc, gs_int, gs_max, nid, 1, wrk);
-  } else {
-    *np = lc->np, *nid = lc->id;
-  }
-}
-
-int rsb(struct array *elements, int nv, int check, parrsb_options *options,
-        struct comm *gc, buffer *bfr) {
-  // `gc` is the global communicator. We make a duplicate of it in `lc` and
-  // keep splitting it. `nc` is the communicator for the two level partitioning.
-  struct comm lc, nc;
-
-  // Duplicate the global communicator to `lc`
-  comm_dup(&lc, gc);
-
-  // Initialize `nc` based on `lc`
-  if (options->two_level) {
-#ifdef MPI
-    MPI_Comm node;
-    MPI_Comm_split_type(lc.c, MPI_COMM_TYPE_SHARED, lc.id, MPI_INFO_NULL,
-                        &node);
-    comm_init(&nc, node);
-    MPI_Comm_free(&node);
-#else
-    comm_init(&nc, 1);
-#endif
-  }
-
-  // Get number of partitions we are going to perform RSB on first level
-  sint np, nid;
-  get_part(&np, &nid, options->two_level, &lc, &nc);
-  debug_print(gc, options->two_level && options->verbose_level,
-              "Number of nodes = %d\n", np);
-
-  struct comm tc;
-  unsigned ndim = (nv == 8) ? 3 : 2;
-  while (np > 1) {
-    // Run the pre-partitioner
-    debug_print(&lc, options->verbose_level > 1, "\tPre-partitioner ...");
-    metric_tic(&lc, RSB_PRE);
-    switch (options->rsb_pre) {
-    case 0: // Sort by global id
-      parallel_sort(struct rsb_element, elements, globalId, gs_long, 0, 1, &lc,
-                    bfr);
-      break;
-    case 1: // RCB
-      rcb(elements, sizeof(struct rsb_element), ndim, &lc, bfr);
-      break;
-    case 2: // RIB
-      rib(elements, sizeof(struct rsb_element), ndim, &lc, bfr);
-      break;
-    default:
-      break;
-    }
-    metric_toc(&lc, RSB_PRE);
-    debug_print(&lc, options->verbose_level > 1, " done.\n");
-
-    // Find the Fiedler vector
-    debug_print(&lc, options->verbose_level > 1, "\tFiedler ...");
-    unsigned bin = (nid >= (np + 1) / 2);
-    comm_split(&lc, bin, lc.id, &tc);
-
-    struct rsb_element *pe = (struct rsb_element *)elements->ptr;
-    for (unsigned i = 0; i < elements->n; i++)
-      pe[i].proc = lc.id;
-
-    metric_tic(&lc, RSB_FIEDLER);
-    fiedler(elements, nv, options, &lc, bfr, gc->id == 0);
-    metric_toc(&lc, RSB_FIEDLER);
-    debug_print(&lc, options->verbose_level > 1, " done.\n");
-
-    // Sort by Fiedler vector
-    debug_print(&lc, options->verbose_level > 1, "\tSort ...");
-    metric_tic(&lc, RSB_SORT);
-    parallel_sort_2(struct rsb_element, elements, fiedler, gs_double, globalId,
-                    gs_long, 0, 1, &lc, bfr);
-    metric_toc(&lc, RSB_SORT);
-    debug_print(&lc, options->verbose_level > 1, " done.\n");
-
-    // Attempt to repair if there are disconnected components
-    debug_print(&lc, options->verbose_level > 1, "\tRepair ...");
-    metric_tic(&lc, RSB_REPAIR);
-    if (options->repair)
-      repair_partitions_v2(elements, nv, &tc, &lc, bin, options->rsb_pre, bfr);
-    metric_toc(&lc, RSB_REPAIR);
-    debug_print(&lc, options->verbose_level > 1, " done.\n");
-
-    // Bisect and balance
-    debug_print(&lc, options->verbose_level > 1, "\tBalance ...");
-    metric_tic(&lc, RSB_BALANCE);
-    balance_partitions(elements, nv, &tc, &lc, bin, bfr);
-    metric_toc(&lc, RSB_BALANCE);
-    debug_print(&lc, options->verbose_level > 1, " done.\n");
-
-    // Split the communicator and recurse on the sub-problems.
-    comm_free(&lc), comm_dup(&lc, &tc), comm_free(&tc);
-    get_part(&np, &nid, options->two_level, &lc, &nc);
-    debug_print(&lc, options->verbose_level > 1, "\tBisect ... done.\n");
-    metric_push_level();
-  }
-  comm_free(&lc);
-
-  // Partition within the node
-  if (options->two_level) {
-    options->two_level = 0;
-    rsb(elements, nv, 0, options, &nc, bfr);
-    comm_free(&nc);
-  }
-
-  if (check)
-    check_rsb_partition(gc, options);
-
-  return 0;
-}
diff --git a/src/rsb.c b/src/rsb.c
index d38da260..066b00fe 100644
--- a/src/rsb.c
+++ b/src/rsb.c
@@ -1,250 +1,400 @@
 #include "metrics.h"
 #include "parrsb-impl.h"
-#include <ctype.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <time.h>
-
-extern int rsb(struct array *elements, int nv, int check,
-               parrsb_options *options, struct comm *gc, buffer *bfr);
-extern int rcb(struct array *elements, size_t unit_size, int ndim,
-               struct comm *ci, buffer *bfr);
-
-parrsb_options parrsb_default_options = {
-    // General options
-    .partitioner = 0,
-    .verbose_level = 0,
-    .profile_level = 0,
-    .two_level = 1,
-    .repair = 0,
-    // RSB common (Lanczos + MG) options
-    .rsb_algo = 0,
-    .rsb_pre = 1,
-    .rsb_max_iter = 50,
-    .rsb_max_passes = 50,
-    .rsb_tol = 1e-5,
-    // RSB MG specific options
-    .rsb_mg_grammian = 0,
-    .rsb_mg_factor = 2,
-    .rsb_mg_sagg = 0};
-
-static char *ALGO[3] = {"RSB", "RCB", "RIB"};
-
-#define UPDATE_OPTION(OPT, STR, IS_INT)                                        \
-  do {                                                                         \
-    const char *val = getenv(STR);                                             \
-    if (val != NULL) {                                                         \
-      if (IS_INT)                                                              \
-        options->OPT = atoi(val);                                              \
-      else                                                                     \
-        options->OPT = atof(val);                                              \
-    }                                                                          \
-  } while (0)
-
-static void update_options(parrsb_options *options) {
-  UPDATE_OPTION(partitioner, "PARRSB_PARTITIONER", 1);
-  UPDATE_OPTION(verbose_level, "PARRSB_VERBOSE_LEVEL", 1);
-  UPDATE_OPTION(profile_level, "PARRSB_PROFILE_LEVEL", 1);
-  UPDATE_OPTION(two_level, "PARRSB_TWO_LEVEL", 1);
-  UPDATE_OPTION(repair, "PARRSB_REPAIR", 1);
-  UPDATE_OPTION(rsb_algo, "PARRSB_RSB_ALGO", 1);
-  UPDATE_OPTION(rsb_pre, "PARRSB_RSB_PRE", 1);
-  UPDATE_OPTION(rsb_max_iter, "PARRSB_RSB_MAX_ITER", 1);
-  UPDATE_OPTION(rsb_max_passes, "PARRSB_RSB_MAX_PASSES", 1);
-  UPDATE_OPTION(rsb_tol, "PARRSB_RSB_TOL", 0);
-  UPDATE_OPTION(rsb_mg_grammian, "PARRSB_RSB_MG_GRAMMIAN", 1);
-  UPDATE_OPTION(rsb_mg_factor, "PARRSB_RSB_MG_FACTOR", 1);
-  UPDATE_OPTION(rsb_mg_sagg, "PARRSB_RSB_MG_SMOOTH_AGGREGATION", 1);
-  if (options->verbose_level == 0)
-    options->profile_level = 0;
-}
+#include "sort.h"
 
-#undef UPDATE_OPTION
-
-#define PRINT_OPTION(OPT, STR, FMT) printf("%s = " FMT "\n", STR, options->OPT)
-
-static void print_options(parrsb_options *options) {
-  PRINT_OPTION(partitioner, "PARRSB_PARTITIONER", "%d");
-  PRINT_OPTION(verbose_level, "PARRSB_VERBOSE_LEVEL", "%d");
-  PRINT_OPTION(profile_level, "PARRSB_PROFILE_LEVEL", "%d");
-  PRINT_OPTION(two_level, "PARRSB_TWO_LEVEL", "%d");
-  PRINT_OPTION(repair, "PARRSB_REPAIR", "%d");
-  PRINT_OPTION(rsb_algo, "PARRSB_RSB_ALGO", "%d");
-  PRINT_OPTION(rsb_pre, "PARRSB_RSB_PRE", "%d");
-  PRINT_OPTION(rsb_max_iter, "PARRSB_RSB_MAX_ITER", "%d");
-  PRINT_OPTION(rsb_max_passes, "PARRSB_RSB_MAX_PASSES", "%d");
-  PRINT_OPTION(rsb_tol, "PARRSB_RSB_TOL", "%lf");
-  PRINT_OPTION(rsb_mg_grammian, "PARRSB_RSB_MG_GRAMMIAN", "%d");
-  PRINT_OPTION(rsb_mg_factor, "PARRSB_RSB_MG_FACTOR", "%d");
-  PRINT_OPTION(rsb_mg_sagg, "PARRSB_RSB_MG_SMOOTH_AGGREGATION", "%d");
-}
+extern int fiedler(struct array *elements, int nv,
+                   const parrsb_options *const options, struct comm *gsc,
+                   buffer *buf, int verbose);
 
-#undef PRINT_OPTION
-
-static size_t load_balance(struct array *elist, uint nel, int nv, double *coord,
-                           long long *vtx, struct crystal *cr, buffer *bfr) {
-  struct comm *c = &cr->comm;
-  slong out[2][1], wrk[2][1], in = nel;
-  comm_scan(out, c, gs_long, gs_add, &in, 1, wrk);
-  slong start = out[0][0], nelg = out[1][0];
-
-  uint nstar = nelg / c->np, nrem = nelg - nstar * c->np;
-  slong lower = (nstar + 1) * nrem;
-
-  size_t unit_size;
-  if (vtx == NULL) // RCB
-    unit_size = sizeof(struct rcb_element);
-  else // RSB
-    unit_size = sizeof(struct rsb_element);
-
-  array_init_(elist, nel, unit_size, __FILE__, __LINE__);
-
-  struct rcb_element *pe = (struct rcb_element *)calloc(1, unit_size);
-  pe->origin = c->id;
-
-  int ndim = (nv == 8) ? 3 : 2;
-  for (uint e = 0; e < nel; ++e) {
-    slong eg = pe->globalId = start + e + 1;
-    if (nstar == 0)
-      pe->proc = eg - 1;
-    else if (eg <= lower)
-      pe->proc = (eg - 1) / (nstar + 1);
-    else
-      pe->proc = (eg - 1 - lower) / nstar + nrem;
-
-    pe->coord[0] = pe->coord[1] = pe->coord[2] = 0.0;
-    for (int v = 0; v < nv; v++)
-      for (int n = 0; n < ndim; n++)
-        pe->coord[n] += coord[e * ndim * nv + v * ndim + n];
-    for (int n = 0; n < ndim; n++)
-      pe->coord[n] /= nv;
-
-    array_cat_(unit_size, elist, pe, 1, __FILE__, __LINE__);
-  }
+static void test_component_versions(struct array *elements, struct comm *lc,
+                                    unsigned nv, unsigned lvl, buffer *bfr) {
+  // Send elements to processor (globalId % P) to create disconnected components.
+  struct crystal cr;
+  crystal_init(&cr, lc);
+
+  struct rsb_element *pe = (struct rsb_element *)elements->ptr;
+  for (unsigned e = 0; e < elements->n; e++)
+    pe[e].proc = pe[e].globalId % lc->np;
+
+  sarray_transfer(struct rsb_element, elements, proc, 1, &cr);
 
-  if (vtx != NULL) { // RSB
-    struct rsb_element *pr = (struct rsb_element *)elist->ptr;
-    for (uint e = 0; e < nel; e++) {
-      for (int v = 0; v < nv; v++)
-        pr[e].vertices[v] = vtx[e * nv + v];
+  struct comm tc0;
+  int color = (lc->id < lc->np / 2); // 1 for the first half of the ranks in lc.
+  comm_split(lc, color, lc->id, &tc0);
+
+  sint nc1 = get_components(NULL, elements, nv, &tc0, bfr, 0);
+  sint nc2 = get_components_v2(NULL, elements, nv, &tc0, bfr, 0);
+  if (nc1 != nc2) {
+    if (tc0.id == 0) {
+      fprintf(stderr, "Error: Level = %u SS BFS != MS BFS: %d %d\n", lvl, nc1,
+              nc2);
+      fflush(stderr);
     }
+    exit(EXIT_FAILURE); // Not rank-guarded: every rank seeing the mismatch exits.
+  }
+  if (nc1 > 1) {
+    if (tc0.id == 0)
+      printf("Warning: Level = %u has %d disconnected components.\n", lvl, nc1);
+    fflush(stdout); // Runs on all ranks; the `if` above covers only the printf.
   }
 
-  sarray_transfer_(elist, unit_size, offsetof(struct rcb_element, proc), 1, cr);
-  if (vtx == NULL) // RCB
-    sarray_sort(struct rcb_element, elist->ptr, elist->n, globalId, 1, bfr);
-  else // RSB
-    sarray_sort(struct rsb_element, elist->ptr, elist->n, globalId, 1, bfr);
-
-  free(pe);
-  return unit_size;
+  comm_free(&tc0);
+  sarray_transfer(struct rsb_element, elements, proc, 0, &cr); // Presumably undoes the scatter above; TODO confirm.
+  crystal_free(&cr);
 }
 
-static void restore_original(int *part, int *seq, struct crystal *cr,
-                             struct array *elist, size_t usize, buffer *bfr) {
-  sarray_transfer_(elist, usize, offsetof(struct rcb_element, origin), 1, cr);
-  uint nel = elist->n;
-
-  if (usize == sizeof(struct rsb_element)) // RSB
-    sarray_sort(struct rsb_element, elist->ptr, nel, globalId, 1, bfr);
-  else if (usize == sizeof(struct rcb_element)) // RCB
-    sarray_sort(struct rcb_element, elist->ptr, nel, globalId, 1, bfr);
-
-  struct rcb_element *element;
-  uint e;
-  for (e = 0; e < nel; e++) {
-    element = (struct rcb_element *)((char *)elist->ptr + e * usize);
-    part[e] = element->origin; // element[e].origin;
-  }
+static void check_rsb_partition(const struct comm *gc,
+                                const parrsb_options *const opts) {
+  int max_levels = log2ll(gc->np);
+  int miter = opts->rsb_max_iter, mpass = opts->rsb_max_passes;
+
+  for (int i = 0; i < max_levels; i++) { // Print warnings for each level i.
+    sint converged = 1; // Cleared below when the iteration count hit its cap.
+    int val = (int)metric_get_value(i, RSB_FIEDLER_CALC_NITER);
+    if (opts->rsb_algo == 0) { // Lanczos, per the warning text below.
+      if (val == miter * mpass)
+        converged = 0;
+    } else if (opts->rsb_algo == 1) { // Inverse iteration, per the warning below.
+      if (val == mpass)
+        converged = 0;
+    }
 
-  if (seq != NULL) {
-    for (e = 0; e < nel; e++) {
-      element = (struct rcb_element *)((char *)elist->ptr + e * usize);
-      seq[e] = element->seq; // element[e].seq;
+    struct comm c;
+    comm_split(gc, converged, gc->id, &c); // The convergence flag is the split color.
+
+    slong bfr[4]; // Work space handed to the reductions below.
+    if (converged == 0) {
+      if (opts->rsb_algo == 0) {
+        double init = metric_get_value(i, TOL_INIT); // Reduced but never printed.
+        comm_allreduce(&c, gs_double, gs_min, &init, 1, (void *)bfr);
+
+        double target = metric_get_value(i, TOL_TGT);
+        comm_allreduce(&c, gs_double, gs_min, &target, 1, (void *)bfr);
+
+        double final = metric_get_value(i, TOL_FNL);
+        comm_allreduce(&c, gs_double, gs_min, &final, 1, (void *)bfr);
+        if (c.id == 0) {
+          printf("Warning: Lanczos reached a residual of %lf (target: %lf) "
+                 "after %d x %d iterations in Level=%d!\n",
+                 final, target, mpass, miter, i);
+          fflush(stdout);
+        }
+      } else if (opts->rsb_algo == 1) {
+        if (c.id == 0) {
+          printf("Warning: Inverse iteration didn't converge after %d "
+                 "iterations in Level = %d\n",
+                 mpass, i);
+          fflush(stdout);
+        }
+      }
+    }
+    comm_free(&c);
+
+    sint minc, maxc; // Min/max of the recorded component count over gc.
+    minc = maxc = (sint)metric_get_value(i, RSB_COMPONENTS_NCOMP);
+    comm_allreduce(gc, gs_int, gs_min, &minc, 1, (void *)bfr);
+    comm_allreduce(gc, gs_int, gs_max, &maxc, 1, (void *)bfr);
+
+    if (maxc > 1 && gc->id == 0) {
+      printf("Warning: Partition created %d/%d (min/max) disconnected "
+             "components in Level=%d!\n",
+             minc, maxc, i);
+      fflush(stdout);
     }
   }
 }
 
-int parrsb_part_mesh(int *part, int *seq, long long *vtx, double *coord,
-                     int nel, int nv, parrsb_options options, MPI_Comm comm) {
-  struct comm c;
-  comm_init(&c, comm);
+static int check_bin_val(int bin, struct comm *c) { // Returns 1 if bin is not 0 or 1, else 0.
+  if (bin < 0 || bin > 1) {
+    if (c->id == 0) { // Only rank 0 of c prints the diagnostic.
+      printf("%s:%d bin value out of range: %d\n", __FILE__, __LINE__, bin);
+      fflush(stdout);
+    }
+    return 1;
+  }
+  return 0;
+}
 
-  slong nelg = nel, wrk;
-  comm_allreduce(&c, gs_long, gs_add, &nelg, 1, &wrk);
+static int balance_partitions(struct array *elements, unsigned nv,
+                              struct comm *lc, struct comm *gc, int bin,
+                              buffer *bfr) {
+  // Return if there is only one processor.
+  if (gc->np == 1)
+    return 0;
+
+  assert(check_bin_val(bin, gc) == 0); // NOTE(review): no check when built with NDEBUG.
+
+  struct ielem_t {
+    uint index, orig;
+    sint dest;
+    scalar fiedler;
+  };
+
+  // Calculate expected # of elements in this partition (lc) and the excess to send.
+  uint ne = elements->n;
+  slong nelgt = ne, nglob = ne, wrk;
+  comm_allreduce(lc, gs_long, gs_add, &nelgt, 1, &wrk);
+  comm_allreduce(gc, gs_long, gs_add, &nglob, 1, &wrk);
+
+  sint ne_ = nglob / gc->np, nrem = nglob - ne_ * gc->np;
+  slong nelgt_exp = ne_ * lc->np + nrem / 2 + (nrem % 2) * (1 - bin);
+  slong send_cnt = nelgt - nelgt_exp > 0 ? nelgt - nelgt_exp : 0;
+
+  // Setup gather-scatter.
+  size_t size = ne * nv;
+  uint e, v;
+  slong *ids = tcalloc(slong, size);
+  struct rsb_element *elems = (struct rsb_element *)elements->ptr;
+  for (e = 0; e < ne; e++) {
+    for (v = 0; v < nv; v++)
+      ids[e * nv + v] = elems[e].vertices[v];
+  }
+  struct gs_data *gsh = gs_setup(ids, size, gc, 0, gs_pairwise, 0);
+
+  sint *input = (sint *)ids; // Reuses the ids allocation as the gs input.
+  if (send_cnt > 0) {
+    for (e = 0; e < size; e++)
+      input[e] = 0;
+  } else {
+    for (e = 0; e < size; e++)
+      input[e] = 1;
+  }
 
-  update_options(&options);
+  gs(input, gs_int, gs_add, 0, gsh, bfr);
 
-  debug_print(&c, options.verbose_level,
-              "Running parRSB ..., nv = %d, nelg = %lld\n", nv, nelg);
-  if (c.id == 0 && options.verbose_level > 0)
-    print_options(&options);
-  fflush(stdout);
+  for (e = 0; e < ne; e++)
+    elems[e].proc = gc->id;
 
-  parrsb_barrier(&c);
-  double t = comm_time();
+  sint sid = (send_cnt == 0) ? gc->id : INT_MAX, balanced = 0;
+  comm_allreduce(gc, gs_int, gs_min, &sid, 1, &wrk); // sid: lowest gc rank with send_cnt == 0.
 
   struct crystal cr;
-  crystal_init(&cr, &c);
-
-  buffer bfr;
-  buffer_init(&bfr, (nel + 1) * sizeof(struct rsb_element));
-
-  // Load balance input data
-  debug_print(&c, options.verbose_level, "Load balance ...");
-  struct array elist;
-  size_t esize = load_balance(&elist, nel, nv, coord, vtx, &cr, &bfr);
-  debug_print(&c, options.verbose_level, " done.\n");
-
-  // Run RSB now
-  debug_print(&c, options.verbose_level, "Running the partitioner ...");
-  struct comm ca;
-  comm_split(&c, elist.n > 0, c.id, &ca);
-  metric_init();
-  if (elist.n > 0) {
-    slong out[2][1], wrk[2][1], in = elist.n;
-    comm_scan(out, &ca, gs_long, gs_add, &in, 1, wrk);
-    slong nelg = out[1][0];
-
-    int ndim = (nv == 8) ? 3 : 2;
-    switch (options.partitioner) {
+
+  if (send_cnt > 0) {
+    struct array ielems;
+    array_init(struct ielem_t, &ielems, 10);
+
+    struct ielem_t ielem = {
+        .index = 0, .orig = lc->id, .dest = -1, .fiedler = 0};
+    int mul = (sid == 0) ? 1 : -1; // Negates the sort key unless the non-sending side starts at rank 0.
+    for (e = 0; e < ne; e++) {
+      for (v = 0; v < nv; v++) {
+        if (input[e * nv + v] > 0) { // Presumably a vertex shared with a non-sending rank.
+          ielem.index = e, ielem.fiedler = mul * elems[e].fiedler;
+          array_cat(struct ielem_t, &ielems, &ielem, 1);
+          break;
+        }
+      }
+    }
+
+    // Sort based on fiedler value and sets `orig` field
+    parallel_sort(struct ielem_t, &ielems, fiedler, gs_double, 0, 1, lc, bfr);
+
+    slong out[2][1], bfr[2][1], nielems = ielems.n; // NOTE: bfr shadows the buffer parameter in this block.
+    comm_scan(out, lc, gs_long, gs_add, &nielems, 1, bfr);
+    slong start = out[0][0];
+
+    sint P = gc->np - lc->np; // Presumably the size of the other partition.
+    sint part_size = (send_cnt + P - 1) / P;
+
+    if (out[1][0] >= send_cnt) {
+      balanced = 1;
+      struct ielem_t *ptr = ielems.ptr;
+      for (e = 0; start + e < send_cnt && e < ielems.n; e++)
+        ptr[e].dest = sid + (start + e) / part_size;
+
+      crystal_init(&cr, lc);
+      sarray_transfer(struct ielem_t, &ielems, orig, 0, &cr);
+      crystal_free(&cr);
+
+      ptr = ielems.ptr;
+      for (e = 0; e < ielems.n; e++)
+        if (ptr[e].dest != -1)
+          elems[ptr[e].index].proc = ptr[e].dest;
+    }
+
+    array_free(&ielems);
+  }
+
+  comm_allreduce(gc, gs_int, gs_max, &balanced, 1, &wrk);
+  if (balanced == 1) {
+    crystal_init(&cr, gc);
+    sarray_transfer(struct rsb_element, elements, proc, 0, &cr);
+    crystal_free(&cr);
+
+    // Do a load balanced sort in each partition
+    parallel_sort(struct rsb_element, elements, fiedler, gs_double, 0, 1, lc,
+                  bfr);
+  } else {
+    // Forget about disconnected components, just do a load balanced partition
+    parallel_sort(struct rsb_element, elements, fiedler, gs_double, 0, 1, gc,
+                  bfr);
+  }
+
+  free(ids), gs_free(gsh);
+  return 0;
+}
+
+static int repair_partitions_v2(struct array *elems, unsigned nv,
+                                struct comm *tc, struct comm *lc, unsigned bin,
+                                unsigned algo, buffer *bfr) {
+  assert(check_bin_val(bin, lc) == 0);
+
+  sint nc = get_components_v2(NULL, elems, nv, tc, bfr, 0), wrk;
+  comm_allreduce(lc, gs_int, gs_max, &nc, 1, &wrk);
+  if (nc > 1) {
+    // If nc > 1, transfer elements by `proc` over lc and redo the `algo` partition.
+    struct crystal cr;
+    crystal_init(&cr, lc);
+    sarray_transfer(struct rsb_element, elems, proc, 0, &cr);
+    crystal_free(&cr);
+
+    // algo: 0 = sort by globalId, 1 = rcb, 2 = rib; other values do nothing.
+    unsigned ndim = (nv == 8) ? 3 : 2;
+    switch (algo) {
     case 0:
-      rsb(&elist, nv, 1, &options, &ca, &bfr);
+      parallel_sort(struct rsb_element, elems, globalId, gs_long, 0, 1, lc,
+                    bfr);
       break;
     case 1:
-      rcb(&elist, esize, ndim, &ca, &bfr);
+      rcb(elems, sizeof(struct rsb_element), ndim, lc, bfr);
       break;
     case 2:
-      rib(&elist, esize, ndim, &ca, &bfr);
+      rib(elems, sizeof(struct rsb_element), ndim, lc, bfr);
       break;
     default:
       break;
     }
 
-    metric_rsb_print(&ca, options.profile_level);
+    // Count the number of components again. NOTE(review): the new count is
+    // only reduced over lc; it is never used and this function always returns 0.
+    nc = get_components_v2(NULL, elems, nv, tc, bfr, 0);
+    comm_allreduce(lc, gs_int, gs_max, &nc, 1, &wrk);
   }
-  metric_finalize(), comm_free(&ca);
-  debug_print(&c, options.verbose_level, " done.\n");
 
-  debug_print(&c, options.verbose_level, "Restore the original input ...");
-  restore_original(part, seq, &cr, &elist, esize, &bfr);
-  debug_print(&c, options.verbose_level, " done.\n");
+  return 0;
+}
+
+static sint get_bisect_comm(struct comm *const tc, const struct comm *const lc,
+                            const uint level, const uint levels,
+                            const struct comm comms[3]) {
+  sint pid, psize; // Position and count used to pick the half (bin) below.
+  if (level < levels - 1) { // Not the last level: count rank-0s of comms[level + 1].
+    sint out[2][1], wrk[2][1], in = (comms[level + 1].id == 0);
+    comm_scan(out, &comms[level], gs_int, gs_add, &in, 1, wrk); // NOTE(review): scans comms[level], not lc; confirm.
+    psize = out[1][0], pid = (comms[level + 1].id == 0) * out[0][0];
+    comm_allreduce(&comms[level + 1], gs_int, gs_max, &pid, 1, wrk); // Share pid within comms[level + 1].
+  } else {
+    psize = lc->np, pid = lc->id; // Last level: split lc by rank.
+  }
 
-  // Report time and finish
-  parrsb_barrier(&c);
-  debug_print(&c, 1, "par%s finished in %g seconds.\n",
-              ALGO[options.partitioner], comm_time() - t);
+  const sint bin = (pid >= (psize + 1) / 2); // 0 if pid < (psize + 1) / 2, else 1.
+  comm_split(lc, bin, lc->id, tc);
+  return bin;
+}
 
-  array_free(&elist), buffer_free(&bfr), crystal_free(&cr), comm_free(&c);
+static uint get_level_cuts(const uint level, const uint levels,
+                           const struct comm comms[3]) {
+  uint n; // Number of parts to separate at this level.
+  if (level < levels - 1) {
+    sint size = (comms[level + 1].id == 0), wrk; // 1 on rank 0 of comms[level + 1].
+    comm_allreduce(&comms[level], gs_int, gs_add, &size, 1, &wrk);
+    n = size; // Count of such rank-0s within comms[level].
+  } else {
+    n = comms[level].np;
+  }
 
-  return 0;
+  sint cuts = 0;
+  uint pow2 = 1;
+  while (pow2 < n) // cuts ends up as the smallest k with 2^k >= n.
+    pow2 <<= 1, cuts++;
+
+  sint wrk;
+  comm_allreduce(&comms[0], gs_int, gs_max, &cuts, 1, &wrk); // Use the max over comms[0].
+
+  return cuts;
 }
 
-void fparrsb_part_mesh(int *part, int *seq, long long *vtx, double *coord,
-                       int *nel, int *nv, int *options, int *comm, int *err) {
-  *err = 1;
-  comm_ext c = MPI_Comm_f2c(*comm);
-  parrsb_options opt = parrsb_default_options;
-  *err = parrsb_part_mesh(part, seq, vtx, coord, *nel, *nv, opt, c);
+void rsb(struct array *elements, int nv, const parrsb_options *const options,
+         const struct comm comms[3], buffer *bfr) {
+  const unsigned levels = options->levels;
+  const sint verbose = options->verbose_level;
+  const uint ndim = (nv == 8) ? 3 : 2;
+  const struct comm *gc = &comms[0];
+  for (uint level = 0; level < levels; level++) {
+    // Find the maximum number of RSB cuts in current level.
+    uint ncuts = get_level_cuts(level, levels, comms);
+    parrsb_print(gc, verbose, "rsb: Level=%u/%u number of cuts = %u", level + 1,
+                 levels, ncuts);
+
+    struct comm lc; // Communicator of the partition currently being bisected.
+    comm_dup(&lc, &comms[level]);
+    for (uint cut = 0; cut < ncuts; cut++) {
+      // Run the pre-partitioner (rsb_pre: 0 = sort by globalId, 1 = rcb, 2 = rib).
+      parrsb_print(gc, verbose - 1, "\trsb: Pre-partition ...");
+
+      metric_tic(&lc, RSB_PRE);
+      switch (options->rsb_pre) {
+      case 0:
+        parallel_sort(struct rsb_element, elements, globalId, gs_long, 0, 1,
+                      &lc, bfr);
+        break;
+      case 1:
+        rcb(elements, sizeof(struct rsb_element), ndim, &lc, bfr);
+        break;
+      case 2:
+        rib(elements, sizeof(struct rsb_element), ndim, &lc, bfr);
+        break;
+      default:
+        break;
+      }
+      metric_toc(&lc, RSB_PRE);
+
+      struct rsb_element *const pe = (struct rsb_element *const)elements->ptr;
+      for (unsigned i = 0; i < elements->n; i++)
+        pe[i].proc = lc.id; // Tag each element with this rank's id in lc.
+
+      // Find the Fiedler vector.
+      parrsb_print(gc, verbose - 1, "\trsb: Fiedler ... ");
+      metric_tic(&lc, RSB_FIEDLER);
+      fiedler(elements, nv, options, &lc, bfr, verbose - 2);
+      metric_toc(&lc, RSB_FIEDLER);
+
+      // Sort by Fiedler vector.
+      parrsb_print(gc, verbose - 1, "\trsb: Sort ...");
+      metric_tic(&lc, RSB_SORT);
+      parallel_sort(struct rsb_element, elements, fiedler, gs_double, 0, 1, &lc,
+                    bfr);
+      metric_toc(&lc, RSB_SORT);
+
+      // `tc` is the new communicator in newly found partitions.
+      struct comm tc;
+      sint bin = get_bisect_comm(&tc, &lc, level, levels, comms);
+
+      // Find the number of disconnected components.
+      parrsb_print(gc, verbose - 1, "\trsb: Components ...");
+      metric_tic(&lc, RSB_COMPONENTS);
+      const uint ncomp =
+          get_components_v2(NULL, elements, nv, &tc, bfr, verbose - 2);
+      metric_acc(RSB_COMPONENTS_NCOMP, ncomp);
+      metric_toc(&lc, RSB_COMPONENTS);
+
+      // Bisect and balance.
+      parrsb_print(gc, verbose - 1, "\trsb: Balance ...");
+      metric_tic(&lc, RSB_BALANCE);
+      balance_partitions(elements, nv, &tc, &lc, bin, bfr);
+      metric_toc(&lc, RSB_BALANCE);
+
+      // Replace `lc` with a copy of `tc` so the next cut works on this sub-problem.
+      parrsb_print(gc, verbose - 1, "\trsb: Bisect ...");
+      comm_free(&lc), comm_dup(&lc, &tc), comm_free(&tc);
+
+      const uint nbrs = parrsb_get_neighbors(elements, nv, gc, &lc, bfr);
+      metric_acc(RSB_NEIGHBORS, nbrs);
+      metric_push_level();
+    }
+    comm_free(&lc);
+  }
+
+  check_rsb_partition(gc, options); // Prints convergence and connectivity warnings.
 }
diff --git a/src/schur.c b/src/schur.c
deleted file mode 100644
index a94bcfac..00000000
--- a/src/schur.c
+++ /dev/null
@@ -1,1264 +0,0 @@
-#include "coarse-impl.h"
-#include "metrics.h"
-#include "multigrid.h"
-#include <math.h>
-
-#define MAX(a, b) ((a) > (b) ? (a) : (b))
-
-//------------------------------------------------------------------------------
-// Cholesky factorization of a matrix
-//
-/*
-symbolic factorization: finds the sparsity structure of L
-
-uses the concept of elimination tree:
-  the parent of node j is node i when L(i,j) is the first
-    non-zero in column j below the diagonal (i>j)
-  L's structure is discovered row-by-row; the first time
-    an entry in column j is set, it must be the parent
-
-the nonzeros in L are the nonzeros in A + paths up the elimination tree
-
-linear in the number of nonzeros of L
-*/
-static uint *cholesky_symbolic(struct mat *L, uint n, uint const *Ap,
-                               uint const *Ai, buffer *buf) {
-  L->n = n;
-
-  uint *parent = tcalloc(uint, 2 * n), *visit = parent + n;
-  uint i, j, nz = 0;
-  for (i = 0; i < n; i++) {
-    parent[i] = n, visit[i] = i;
-    for (uint p = Ap[i]; p < Ap[i + 1]; p++) {
-      if ((j = Ai[p]) >= i)
-        break;
-      for (; visit[j] != i; j = parent[j]) {
-        ++nz, visit[j] = i;
-        if (parent[j] == n) {
-          parent[j] = i;
-          break;
-        }
-      }
-    }
-  }
-
-  uint *Lp = L->Lp = tcalloc(uint, n + 1);
-  uint *Li = L->Li = tcalloc(uint, nz);
-
-  Lp[0] = 0;
-  uint *Lir, nzr;
-  for (i = 0; i < n; i++) {
-    nzr = 0, Lir = &Li[Lp[i]];
-    visit[i] = i;
-    for (uint p = Ap[i]; p < Ap[i + 1]; p++) {
-      if ((j = Ai[p]) >= i)
-        break;
-      for (; visit[j] != i; j = parent[j])
-        Lir[nzr++] = j, visit[j] = i;
-    }
-    sortv(Lir, Lir, nzr, sizeof(uint), buf);
-    Lp[i + 1] = Lp[i] + nzr;
-  }
-
-  free(parent);
-  return 0;
-}
-
-/*
-numeric factorization:
-
-L is built row-by-row, using:    ( ' indicates transpose )
-
-
-[ A  r ]  = [ (I-L)   ] [ D^(-1)  ] [ (I-L)' -s ]
-[ r' a ]    [  -s'  1 ] [     1/d ] [         1 ]
-
-          = [ A   (I-L) D^(-1) (-s)  ]
-            [ r'  s' D^(-1) s + 1/d  ]
-
-so, if r' is the next row of A, up to but excluding the diagonal,
-then the next row of L, s', obeys
-
-   r = - (I-L) D^(-1) s
-
-let y = (I-L)^(-1) (-r)
-then s = D y, and d = 1/(a - s' y)
-*/
-static void cholesky_numeric(struct mat *chol, const uint n, const uint *Ap,
-                             const uint *Ai, const scalar *A, uint *visit,
-                             scalar *y) {
-  const uint *Lp = chol->Lp, *Li = chol->Li;
-  scalar *D = chol->D = tcalloc(scalar, n);
-  scalar *L = chol->L = tcalloc(scalar, Lp[n]);
-
-  uint i;
-  for (i = 0; i < n; i++) {
-    uint p, pe, j;
-    scalar a;
-    visit[i] = n;
-    for (p = Lp[i], pe = Lp[i + 1]; p != pe; p++)
-      j = Li[p], y[j] = 0, visit[j] = i;
-    for (p = Ap[i], pe = Ap[i + 1]; p != pe; p++) {
-      if ((j = Ai[p]) >= i) {
-        if (j == i)
-          a = A[p];
-        break;
-      }
-      y[j] = -A[p];
-    }
-    for (p = Lp[i], pe = Lp[i + 1]; p != pe; p++) {
-      uint j = Li[p], q = Lp[j], qe = Lp[j + 1];
-      scalar yj = y[j];
-      for (; q != qe; q++) {
-        uint k = Li[q];
-        if (visit[k] == i)
-          yj += L[q] * y[k];
-      }
-      y[j] = yj;
-      scalar lij = L[p] = D[j] * yj;
-      a -= lij * yj;
-    }
-    D[i] = 1 / a;
-  }
-}
-
-static void cholesky_factor(struct mat *L, struct mat *A, uint null_space,
-                            buffer *buf) {
-  L->start = A->start;
-  const uint uints_as_dbls =
-      (A->n * sizeof(uint) + sizeof(double) - 1) / sizeof(double);
-  buffer_reserve(buf, (uints_as_dbls + A->n - null_space) * sizeof(double));
-  cholesky_symbolic(L, A->n - null_space, A->Lp, A->Li, buf);
-  cholesky_numeric(L, L->n, A->Lp, A->Li, A->L, buf->ptr,
-                   uints_as_dbls + (double *)buf->ptr);
-  A->n = A->n - null_space;
-}
-
-static void cholesky_solve(scalar *x, const struct mat *A, scalar *b) {
-  const uint *Lp = A->Lp, *Li = A->Li, n = A->n;
-  const scalar *L = A->L, *D = A->D;
-
-  uint i, p, pe;
-  for (i = 0; i < n; i++) {
-    scalar xi = b[i];
-    for (p = Lp[i], pe = Lp[i + 1]; p != pe; p++)
-      xi += x[Li[p]] * L[p];
-    x[i] = xi;
-  }
-
-  for (i = 0; i < n; i++)
-    x[i] *= D[i];
-
-  for (i = n; i > 0;) {
-    scalar xi = x[--i];
-    for (p = Lp[i], pe = Lp[i + 1]; p != pe; p++) {
-      x[Li[p]] += xi * L[p];
-    }
-    x[i] = xi;
-  }
-}
-
-static void cholesky_lower_solve(scalar *x, const struct mat *A, scalar *b) {
-  const uint *Lp = A->Lp, *Li = A->Li, n = A->n;
-  const scalar *L = A->L, *D = A->D;
-
-  uint i, p, pe;
-  for (i = 0; i < n; i++) {
-    scalar xi = b[i];
-    for (p = Lp[i], pe = Lp[i + 1]; p != pe; p++)
-      xi += x[Li[p]] * L[p];
-    x[i] = xi;
-  }
-
-  for (i = 0; i < n; i++)
-    x[i] *= sqrt(D[i]);
-}
-
-static void cholesky_upper_solve(scalar *x, const struct mat *A, scalar *b) {
-  const uint *Lp = A->Lp, *Li = A->Li, n = A->n;
-  const scalar *L = A->L, *D = A->D;
-
-  uint i;
-  for (i = 0; i < n; i++)
-    x[i] = b[i] * sqrt(D[i]);
-
-  uint p, pe;
-  for (i = n; i > 0;) {
-    scalar xi = x[--i];
-    for (p = Lp[i], pe = Lp[i + 1]; p != pe; p++) {
-      x[Li[p]] += xi * L[p];
-    }
-    x[i] = xi;
-  }
-}
-
-//-----------------------------------------------------------------------------
-// Schur setup, solve and free
-//
-// A_ll: local dof of a processor (block diagonal across processors)
-// A_sl: shared - local matrix
-// A_ss: shared dof freedom (matrix is split row wise)
-//
-//     |A_ll (B)  A_ls (F)|
-//  A= |                  |
-//     |A_sl (E)  A_ss (S)|
-//
-struct schur {
-  struct mat A_ll;
-  struct par_mat A_ls, A_sl, A_ss;
-  struct gs_data *Q_ls, *Q_sl, *Q_ss;
-  struct mg *M;
-};
-
-static int S_owns_row(const ulong r, const ulong *rows, const uint n) {
-  // We can do a binary search instead of linear search
-  uint i = 0;
-  while (i < n && rows[i] != r)
-    i++;
-  return i;
-}
-
-// Calculate G = L_{B}^{-1} x F where B = L_{B} U_{B}. F is in CSR format,
-// distributed by rows similar to B. G will be in CSC format and distributed
-// by columns similar to row distribution of S.
-static int schur_setup_G(struct par_mat *G, scalar tol, const struct mat *L,
-                         const struct par_mat *F, const ulong *srows,
-                         const uint srn, struct crystal *const cr,
-                         buffer *bfr) {
-  assert(IS_CSR(F));
-  assert(!IS_DIAG(F));
-
-  buffer_reserve(bfr, sizeof(scalar) * L->n * F->cn);
-  scalar *v = (scalar *)bfr->ptr;
-  for (uint i = 0; i < L->n * F->cn; i++)
-    v[i] = 0;
-
-  // Do L_B^{-1} x F now. Columns of L_B^{-1} are found one by one and
-  // then they are multplied by F. Is the above description correct?
-  scalar *b = tcalloc(scalar, 2 * L->n);
-  scalar *x = b + L->n;
-  for (uint i = 0; i < F->rn; i++) {
-    b[F->rows[i] - L->start] = 1;
-    cholesky_lower_solve(x, L, b);
-
-    // Calculate F: i^th row of F is multiplied by each element of i^th
-    // column of L_B^-1
-    for (uint k = F->adj_off[i], ke = F->adj_off[i + 1]; k < ke; k++)
-      for (uint j = 0; j < L->n; j++)
-        // m.c = F->cols[F->adj_idx[k]], m.r = L->start + j;
-        v[j * F->cn + F->adj_idx[k]] += F->adj_val[k] * x[j];
-
-    b[F->rows[i] - L->start] = 0;
-    for (uint j = 0; j < L->n; j++)
-      x[j] = 0;
-  }
-
-  uint size = L->n * 20 + 1;
-  struct array unique;
-  array_init(struct mij, &unique, size);
-
-  struct comm *c = &cr->comm;
-  struct mij m = {.r = 0, .c = 0, .idx = 1, .p = 0, .v = 0};
-  for (uint i = 0; i < L->n; i++) {
-    for (uint j = 0; j < F->cn; j++) {
-      if (fabs(v[i * F->cn + j]) >= tol) {
-        m.r = L->start + i, m.c = F->cols[j], m.p = m.c % c->np;
-        m.v = v[i * F->cn + j];
-        array_cat(struct mij, &unique, &m, 1);
-      }
-    }
-  }
-
-  m.r = 0, m.idx = 0, m.v = 0;
-  for (uint i = 0; i < srn; i++) {
-    m.c = srows[i], m.p = m.c % c->np;
-    array_cat(struct mij, &unique, &m, 1);
-  }
-
-  sarray_transfer(struct mij, &unique, p, 1, cr);
-
-  struct array mijs;
-  array_init(struct mij, &mijs, unique.n);
-  if (unique.n > 0) {
-    sarray_sort_2(struct mij, unique.ptr, unique.n, c, 1, idx, 0, bfr);
-    struct mij *pu = (struct mij *)unique.ptr;
-    uint i = 0, j = 1;
-    for (; j < unique.n; j++) {
-      if (pu[j].c != pu[i].c) {
-        assert(pu[i].idx == 0);
-        for (uint k = i + 1; k < j; k++) {
-          pu[k].p = pu[i].p;
-          array_cat(struct mij, &mijs, &pu[k], 1);
-        }
-        i = j;
-      }
-    }
-    assert(pu[i].idx == 0);
-    for (uint k = i + 1; k < unique.n; k++) {
-      pu[k].p = pu[i].p;
-      array_cat(struct mij, &mijs, &pu[k], 1);
-    }
-  }
-  array_free(&unique);
-
-  sarray_transfer(struct mij, &mijs, p, 0, cr);
-  par_csc_setup(G, &mijs, 0, bfr);
-  array_free(&mijs);
-#ifdef DUMPG
-  par_mat_print(G);
-#endif
-
-  return 0;
-}
-
-// Calculate W = E x U_{B}^{-1} where B = L_{B} U_{B}. E is in CSC format.
-// W will be in CSR format and distributed by rows similar to distribution of S.
-static int schur_setup_W(struct par_mat *W, scalar tol, const struct mat *L,
-                         const struct par_mat *E, const ulong *srows,
-                         const uint srn, struct crystal *const cr,
-                         buffer *bfr) {
-  assert(IS_CSC(E));
-  assert(!IS_DIAG(E));
-
-  buffer_reserve(bfr, sizeof(scalar) * L->n * E->rn);
-  scalar *v = (scalar *)bfr->ptr;
-  for (uint i = 0; i < L->n * E->rn; i++)
-    v[i] = 0;
-
-  // Multiply E by U_B^{-1} now. Columns of U_B^{-1} are found one by one and
-  // then E is multiplied by each column.
-  scalar *b = tcalloc(scalar, 2 * L->n);
-  scalar *x = b + L->n;
-  for (uint i = 0; i < L->n; i++) {
-    b[i] = 1;
-    cholesky_upper_solve(x, L, b);
-
-    // Multiply E by x: i^th col of E is multiplied by element x[i]
-    for (uint j = 0; j < E->cn; j++)
-      for (uint k = E->adj_off[j], ke = E->adj_off[j + 1]; k < ke; k++)
-        // m.c = L->start + i, m.r = E->rows[E->adj_idx[k]];
-        v[E->adj_idx[k] * L->n + i] += E->adj_val[k] * x[E->cols[j] - L->start];
-
-    b[i] = 0;
-    for (uint j = 0; j < L->n; j++)
-      x[j] = 0;
-  }
-
-  uint size = E->rn * 20 + 1;
-  struct array unique;
-  array_init(struct mij, &unique, size);
-
-  struct comm *c = &cr->comm;
-  struct mij m = {.r = 0, .c = 0, .idx = 1, .p = 0, .v = 0};
-  for (uint i = 0; i < E->rn; i++) {
-    for (uint j = 0; j < L->n; j++) {
-      if (fabs(v[i * L->n + j]) >= tol) {
-        m.r = E->rows[i], m.c = L->start + j, m.p = m.r % c->np;
-        m.v = v[i * L->n + j];
-        array_cat(struct mij, &unique, &m, 1);
-      }
-    }
-  }
-
-  m.c = 0, m.idx = 0, m.v = 0;
-  for (uint i = 0; i < srn; i++) {
-    m.r = srows[i], m.p = m.r % c->np;
-    array_cat(struct mij, &unique, &m, 1);
-  }
-
-  sarray_transfer(struct mij, &unique, p, 1, cr);
-
-  struct array mijs;
-  array_init(struct mij, &mijs, unique.n);
-  if (unique.n > 0) {
-    sarray_sort_2(struct mij, unique.ptr, unique.n, r, 1, idx, 0, bfr);
-    struct mij *pu = (struct mij *)unique.ptr;
-    uint i = 0, j = 1;
-    for (; j < unique.n; j++) {
-      if (pu[j].r != pu[i].r) {
-        assert(pu[i].idx == 0);
-        for (uint k = i + 1; k < j; k++) {
-          pu[k].p = pu[i].p;
-          array_cat(struct mij, &mijs, &pu[k], 1);
-        }
-        i = j;
-      }
-    }
-    assert(pu[i].idx == 0);
-    for (uint k = i + 1; k < unique.n; k++) {
-      pu[k].p = pu[i].p;
-      array_cat(struct mij, &mijs, &pu[k], 1);
-    }
-  }
-  array_free(&unique);
-
-  sarray_transfer(struct mij, &mijs, p, 0, cr);
-  par_csr_setup(W, &mijs, 0, bfr);
-  array_free(&mijs);
-#ifdef DUMPW
-  par_mat_print(W);
-#endif
-
-  return 0;
-}
-
-// C = A - B; A and B should be in CSR format with the same row
-// distribution across processors
-static int sparse_sub(struct par_mat *C, const struct par_mat *A,
-                      const struct par_mat *B, buffer *bfr) {
-  assert(IS_CSR(A));
-  assert(IS_CSR(B));
-
-  struct array cij;
-  array_init(struct mij, &cij, 100);
-
-  struct mij m;
-  uint r, j, je;
-  for (r = 0; r < B->rn; r++) {
-    m.r = B->rows[r];
-    for (j = B->adj_off[r], je = B->adj_off[r + 1]; j != je; j++) {
-      m.c = B->cols[B->adj_idx[j]], m.v = -B->adj_val[j];
-      array_cat(struct mij, &cij, &m, 1);
-    }
-  }
-  if (IS_DIAG(B)) {
-    for (r = 0; r < B->rn; r++) {
-      m.r = m.c = B->rows[r], m.v = -B->diag_val[r];
-      array_cat(struct mij, &cij, &m, 1);
-    }
-  }
-
-  for (r = 0; r < A->rn; r++) {
-    m.r = A->rows[r];
-    for (j = A->adj_off[r], je = A->adj_off[r + 1]; j != je; j++) {
-      m.c = A->cols[A->adj_idx[j]], m.v = A->adj_val[j];
-      array_cat(struct mij, &cij, &m, 1);
-    }
-  }
-  if (IS_DIAG(A)) {
-    for (r = 0; r < A->rn; r++) {
-      m.r = A->rows[r], m.c = A->rows[r], m.v = A->diag_val[r];
-      array_cat(struct mij, &cij, &m, 1);
-    }
-  }
-
-  struct array unique;
-  array_init(struct mij, &unique, 100);
-  if (cij.n > 0) {
-    sarray_sort_2(struct mij, cij.ptr, cij.n, r, 1, c, 1, bfr);
-    struct mij *ptr = (struct mij *)cij.ptr;
-    uint i = 0;
-    while (i < cij.n) {
-      scalar s = 0;
-      for (j = i; j < cij.n && ptr[j].r == ptr[i].r && ptr[j].c == ptr[i].c;
-           j++)
-        s += ptr[j].v;
-      m = ptr[i], m.v = s;
-      array_cat(struct mij, &unique, &m, 1);
-      i = j;
-    }
-  }
-  array_free(&cij);
-
-  par_csr_setup(C, &unique, 1, bfr);
-  array_free(&unique);
-
-  return 0;
-}
-
-int sparse_gemm(struct par_mat *WG, const struct par_mat *W,
-                const struct par_mat *G, int diag_wg, struct crystal *cr,
-                buffer *bfr) {
-  // W is in CSR, G is in CSC; we multiply rows of W by shifting
-  // the columns of G from processor to processor. This is not scalable
-  // at all -- need to do a 2D partition of the matrices W and G.
-  assert(IS_CSR(W) && !IS_DIAG(W));
-  assert(IS_CSC(G));
-
-  // Put G into an array to transfer from processor to processor
-  struct array gij, sij;
-  array_init(struct mij, &gij, 100);
-  array_init(struct mij, &sij, 100);
-
-  struct mij m = {.r = 0, .c = 0, .idx = 0, .p = cr->comm.id, .v = 0};
-  uint i, j, je;
-  for (i = 0; i < G->cn; i++) {
-    m.c = G->cols[i];
-    for (j = G->adj_off[i], je = G->adj_off[i + 1]; j != je; j++) {
-      m.r = G->rows[G->adj_idx[j]];
-      m.v = G->adj_val[j];
-      array_cat(struct mij, &gij, &m, 1);
-    }
-  }
-  if (IS_DIAG(G)) {
-    for (i = 0; i < G->cn; i++) {
-      m.c = m.r = G->cols[i];
-      m.v = G->diag_val[i];
-      array_cat(struct mij, &gij, &m, 1);
-    }
-  }
-
-  sarray_sort_2(struct mij, gij.ptr, gij.n, c, 1, r, 1, bfr);
-  struct mij *pg = (struct mij *)gij.ptr;
-  for (i = 0; i < gij.n; i++)
-    pg[i].idx = i;
-
-  for (uint p = 0; p < cr->comm.np; p++) {
-    // Calculate dot product of each row of W with columns of G
-    for (i = 0; i < W->rn; i++) {
-      m.r = W->rows[i];
-      uint s = 0, e = 0;
-      while (s < gij.n) {
-        m.c = pg[s].c, m.v = 0;
-        for (j = W->adj_off[i], je = W->adj_off[i + 1]; j < je; j++) {
-          ulong k = W->cols[W->adj_idx[j]];
-          while (e < gij.n && pg[s].c == pg[e].c && pg[e].r < k)
-            e++;
-          if (e < gij.n && pg[s].c == pg[e].c && pg[e].r == k)
-            m.v += W->adj_val[j] * pg[e].v;
-        }
-        while (e < gij.n && pg[s].c == pg[e].c)
-          e++;
-        if (fabs(m.v) > 1e-12)
-          array_cat(struct mij, &sij, &m, 1);
-        s = e;
-      }
-    }
-
-    sint next = (cr->comm.id + 1) % cr->comm.np;
-    for (i = 0; i < gij.n; i++)
-      pg[i].p = next;
-    sarray_transfer(struct mij, &gij, p, 0, cr);
-
-    sarray_sort(struct mij, gij.ptr, gij.n, idx, 0, bfr);
-    pg = gij.ptr;
-  }
-
-  par_csr_setup(WG, &sij, diag_wg, bfr);
-  array_free(&gij), array_free(&sij);
-
-  return 0;
-}
-
-static struct mg *
-schur_precond_setup(const struct mat *L, const struct par_mat *F,
-                    const struct par_mat *S, const struct par_mat *E, ulong si,
-                    uint ni, struct crystal *cr, buffer *bfr) {
-  // TODO: Sparsify W and G when they are built
-  struct par_mat W, G, WG;
-
-  struct comm *c = &cr->comm;
-  comm_barrier(c);
-  double t = comm_time();
-
-  double tol = 1e-12;
-  char *val = getenv("PARRSB_SCHUR_TOL");
-  if (val)
-    tol = atof(val);
-  schur_setup_G(&G, tol, L, F, S->rows, S->rn, cr, bfr);
-
-  t = comm_time() - t;
-  double wrk, min = t, max = t;
-  comm_allreduce(c, gs_double, gs_min, &min, 1, &wrk);
-  comm_allreduce(c, gs_double, gs_max, &max, 1, &wrk);
-  if (c->id == 0) {
-    printf("\tSetup G          : %g %g (min max)\n", min, max);
-    fflush(stdout);
-  }
-
-  comm_barrier(c);
-  t = comm_time();
-
-  schur_setup_W(&W, tol, L, E, S->rows, S->rn, cr, bfr);
-
-  min = max = comm_time() - t;
-  comm_allreduce(c, gs_double, gs_min, &min, 1, &wrk);
-  comm_allreduce(c, gs_double, gs_max, &max, 1, &wrk);
-  if (c->id == 0) {
-    printf("\tSetup W          : %g %g (min max)\n", min, max);
-    fflush(stdout);
-  }
-
-  comm_barrier(c);
-  t = comm_time();
-
-  sparse_gemm(&WG, &W, &G, 0, cr, bfr);
-
-  min = max = comm_time() - t;
-  comm_allreduce(c, gs_double, gs_min, &min, 1, &wrk);
-  comm_allreduce(c, gs_double, gs_max, &max, 1, &wrk);
-  if (c->id == 0) {
-    printf("\tSparse gemm      : %g %g (min max)\n", min, max);
-    fflush(stdout);
-  }
-
-#ifdef DUMPWG
-  par_mat_print(&WG);
-#endif
-
-  comm_barrier(c);
-  t = comm_time();
-
-  // P is CSR
-  struct par_mat *P = tcalloc(struct par_mat, 1);
-  sparse_sub(P, S, &WG, bfr);
-
-  min = max = comm_time() - t;
-  comm_allreduce(c, gs_double, gs_min, &min, 1, &wrk);
-  comm_allreduce(c, gs_double, gs_max, &max, 1, &wrk);
-  if (c->id == 0) {
-    printf("\tSparse sub       : %g %g (min max)\n", min, max);
-    fflush(stdout);
-  }
-
-#ifdef DUMPP
-  par_mat_print(P);
-#endif
-
-  par_mat_free(&W), par_mat_free(&G), par_mat_free(&WG);
-
-  comm_barrier(c);
-  t = comm_time();
-
-  int factor = 2;
-  val = getenv("PARRSB_SCHUR_MG_FACTOR");
-  if (val)
-    factor = atoi(val);
-  struct mg *precond = mg_setup(P, factor, 0, cr, bfr);
-
-  min = max = comm_time() - t;
-  comm_allreduce(c, gs_double, gs_min, &min, 1, &wrk);
-  comm_allreduce(c, gs_double, gs_max, &max, 1, &wrk);
-  if (c->id == 0) {
-    printf("\tMG precond       : %g %g (min max)\n", min, max);
-    fflush(stdout);
-  }
-
-  return precond;
-}
-
-static struct gs_data *setup_Ezl_Q(struct par_mat *E, ulong s, uint n,
-                                   struct comm *c, buffer *bfr) {
-  assert(IS_CSC(E));
-  assert(!IS_DIAG(E));
-
-  buffer_reserve(bfr, sizeof(slong) * (n + E->rn));
-  slong *ids = (slong *)bfr->ptr;
-  uint i, j;
-  for (i = 0; i < n; i++)
-    ids[i] = s + i;
-  for (j = 0; j < E->rn; j++, i++)
-    ids[i] = -E->rows[j];
-
-#if 0
-  comm_barrier(c);
-  for (uint p = 0; p < c->np; p++) {
-    if (c->id == p) {
-      printf("\np = %d, s = %u ids = ", p, n + E->rn);
-      for (uint i = 0; i < n + E->rn; i++) {
-        printf("%lld ", ids[i]);
-        fflush(stdout);
-      }
-      printf("\n");
-    }
-    comm_barrier(c);
-  }
-#endif
-
-  return gs_setup(ids, n + E->rn, c, 0, gs_auto, 0);
-}
-
-static int Ezl(scalar *y, const struct par_mat *E, struct gs_data *gsh,
-               const scalar *zl, const ulong s, const uint n, buffer *bfr) {
-  assert(IS_CSC(E));
-  assert(!IS_DIAG(E));
-
-  uint nn = n + E->rn;
-  scalar *wrk = (scalar *)tcalloc(scalar, nn);
-  scalar *ye = wrk + n;
-  for (uint i = 0; i < E->cn; i++) {
-    scalar zlk = zl[E->cols[i] - s];
-    for (uint j = E->adj_off[i], je = E->adj_off[i + 1]; j < je; j++)
-      ye[E->adj_idx[j]] += zlk * E->adj_val[j];
-  }
-
-#if 0
-  for (uint i = 0; i < n + E->rn; i++) {
-    printf("wrk in = %u, E->rn = %u, E->cn = %u, i = %u, %lf\n", n, E->rn,
-           E->cn, i, wrk[i]);
-    fflush(stdout);
-  }
-#endif
-
-  gs(wrk, gs_double, gs_add, 1, gsh, bfr);
-
-  for (uint i = 0; i < n; i++)
-    y[i] = wrk[i];
-
-  free(wrk);
-
-  return 0;
-}
-
-static struct gs_data *setup_Fxi_Q(struct par_mat *F, ulong s, uint n,
-                                   struct comm *c, buffer *bfr) {
-  assert(IS_CSR(F));
-  assert(!IS_DIAG(F));
-
-  uint nnz = F->rn > 0 ? F->adj_off[F->rn] : 0;
-  buffer_reserve(bfr, sizeof(slong) * (n + nnz));
-  slong *ids = (slong *)bfr->ptr;
-  uint i, j;
-  for (i = 0; i < nnz; i++)
-    ids[i] = F->cols[F->adj_idx[i]];
-  for (j = 0; j < n; j++, i++)
-    ids[i] = -(s + j);
-
-  return gs_setup(ids, i, c, 0, gs_pairwise, 0);
-}
-
-static int Fxi(scalar *y, const struct par_mat *F, struct gs_data *gsh,
-               scalar *xi, const ulong s, const uint n, buffer *bfr) {
-  assert(IS_CSR(F));
-  assert(!IS_DIAG(F));
-
-  uint nnz = F->rn > 0 ? F->adj_off[F->rn] : 0;
-  scalar *wrk = (scalar *)tcalloc(scalar, nnz + n);
-  uint i, j;
-  for (i = 0; i < nnz; i++)
-    wrk[i] = 0;
-  for (j = 0; j < n; j++, i++)
-    wrk[i] = xi[j];
-
-  gs(wrk, gs_double, gs_add, 1, gsh, bfr);
-
-  for (i = 0; i < F->rn; i++) {
-    scalar si = 0;
-    for (uint j = F->adj_off[i], je = F->adj_off[i + 1]; j < je; j++)
-      si += F->adj_val[j] * wrk[j];
-    y[F->rows[i] - s] = si;
-  }
-  return 0;
-}
-
-static int distribute_by_columns(struct array *aij, ulong s, uint n, ulong ng,
-                                 struct crystal *cr, buffer *bfr) {
-  slong *cols = (slong *)tcalloc(slong, n + aij->n);
-  sint *owner = (sint *)tcalloc(sint, n + aij->n);
-
-  struct mij *ptr = (struct mij *)aij->ptr;
-  for (uint i = 0; i < aij->n; i++) {
-    cols[i] = ptr[i].c;
-    owner[i] = -1;
-  }
-
-  struct comm *c = &cr->comm;
-  for (uint i = 0; i < n; i++) {
-    cols[aij->n + i] = s + i;
-    owner[aij->n + i] = c->id;
-  }
-
-  struct gs_data *gsh = gs_setup(cols, aij->n + n, c, 0, gs_auto, 0);
-  gs(owner, gs_int, gs_max, 0, gsh, bfr);
-  gs_free(gsh);
-
-  for (uint i = 0; i < aij->n; i++) {
-    assert(owner[i] >= 0 && owner[i] < c->np);
-    ptr[i].p = owner[i];
-  }
-
-  free(owner);
-  free(cols);
-
-  sarray_transfer(struct mij, aij, p, 1, cr);
-
-  return 0;
-}
-
-static inline scalar dot(scalar *r, scalar *s, uint n) {
-  scalar t = 0;
-  for (uint i = 0; i < n; i++)
-    t += r[i] * s[i];
-  return t;
-}
-
-static inline void ortho(scalar *q, uint n, ulong ng, struct comm *c) {
-  scalar s = 0, buf;
-  for (uint i = 0; i < n; i++)
-    s += q[i];
-
-  comm_allreduce(c, gs_double, gs_add, &s, 1, &buf);
-  s /= ng;
-
-  for (uint i = 0; i < n; i++)
-    q[i] -= s;
-}
-
-static int schur_action(scalar *y, const struct schur *schur, scalar *x,
-                        ulong ls, scalar *wrk, buffer *bfr, struct comm *c) {
-  const struct par_mat *S = &schur->A_ss;
-  assert(IS_CSR(S));
-  assert(S->rn == 0 || IS_DIAG(S));
-
-  uint ln = schur->A_ll.n, in = S->rn;
-  uint mn = ln > in ? ln : in;
-  scalar *xl = (scalar *)tcalloc(scalar, 2 * mn), *exl = xl + mn;
-
-  metric_tic(c, SCHUR_PROJECT_OPERATOR_FXI);
-  // Calculate (E (B^-1) F) x
-  // Fx: x has size in, Fx has size ln. So wrk has to be at least ln
-  Fxi(exl, &schur->A_ls, schur->Q_ls, x, ls, in, bfr);
-  metric_toc(c, SCHUR_PROJECT_OPERATOR_FXI);
-
-  metric_tic(c, SCHUR_PROJECT_OPERATOR_CHOL);
-  // Multiply Fx by B^-1 or (LU)^-1
-  cholesky_solve(xl, &schur->A_ll, exl);
-  metric_toc(c, SCHUR_PROJECT_OPERATOR_CHOL);
-
-  metric_tic(c, SCHUR_PROJECT_OPERATOR_EZL);
-  // Multuply (B^-1)Fx by E
-  Ezl(exl, &schur->A_sl, schur->Q_sl, xl, ls, in, bfr);
-  metric_toc(c, SCHUR_PROJECT_OPERATOR_EZL);
-
-  metric_tic(c, SCHUR_PROJECT_OPERATOR_MATVEC);
-  // Separately calculate Sx
-  mat_vec_csr(y, x, S, schur->Q_ss, wrk, bfr);
-  metric_toc(c, SCHUR_PROJECT_OPERATOR_MATVEC);
-
-  for (uint i = 0; i < in; i++)
-    y[i] -= exl[i];
-
-  free(xl);
-
-  return 0;
-}
-
-static int project(scalar *x, scalar *b, const struct schur *schur, ulong ls,
-                   struct comm *c, int miter, scalar tol, int null_space,
-                   int verbose, buffer *bfr) {
-  const struct par_mat *S = &schur->A_ss;
-  struct mg *d = schur->M;
-
-  slong out[2][1], buf[2][1], in = S->rn;
-  comm_scan(out, c, gs_long, gs_add, &in, 1, buf);
-  ulong ng = out[1][0];
-
-  if (ng == 0)
-    return 0;
-
-  uint n = S->rn, nnz = n > 0 ? S->adj_off[n] + n : 0;
-  scalar *z = (scalar *)tcalloc(scalar, 6 * n + nnz);
-  scalar *w = z + n, *r = w + n, *p = r + n, *z0 = p + n, *dz = z0 + n;
-  scalar *wrk = dz + n;
-  scalar *P = (scalar *)tcalloc(scalar, 2 * (miter + 1) * n);
-  scalar *W = P + n * (miter + 1);
-
-  uint i;
-  for (i = 0; i < n; i++) {
-    x[i] = 0;
-    r[i] = b[i];
-  }
-
-  scalar rr = dot(r, r, n);
-  comm_allreduce(c, gs_double, gs_add, &rr, 1, buf);
-  scalar rtol = MAX(rr * tol * tol, tol * tol);
-
-  for (i = 0; i < n; i++)
-    z[i] = r[i];
-  if (null_space)
-    ortho(z, n, ng, c);
-
-  scalar rz1 = dot(r, z, n);
-  comm_allreduce(c, gs_double, gs_add, &rz1, 1, buf);
-
-  for (i = 0; i < n; i++)
-    p[i] = z[i];
-
-  scalar alpha, beta, rzt, rz2;
-  uint j, k;
-  for (i = 0; i < miter; i++) {
-    // Action of S - E (LU)^-1 F
-    metric_tic(c, SCHUR_PROJECT_OPERATOR);
-    schur_action(w, schur, p, ls, wrk, bfr, c);
-    metric_toc(c, SCHUR_PROJECT_OPERATOR);
-
-    scalar pw = dot(p, w, n);
-    comm_allreduce(c, gs_double, gs_add, &pw, 1, buf);
-    alpha = rz1 / pw;
-
-    pw = 1.0 / sqrt(pw);
-    for (j = 0; j < n; j++) {
-      W[i * n + j] = pw * w[j];
-      P[i * n + j] = pw * p[j];
-    }
-
-    for (j = 0; j < n; j++) {
-      x[j] += alpha * p[j];
-      r[j] -= alpha * w[j];
-    }
-
-    rr = dot(r, r, n);
-    comm_allreduce(c, gs_double, gs_add, &rr, 1, buf);
-    if (rr < rtol || sqrt(rr) < tol)
-      break;
-
-    for (j = 0; j < n; j++)
-      z0[j] = z[j];
-
-    metric_tic(c, SCHUR_PROJECT_PRECOND);
-#if 1
-    mg_vcycle(z, r, d, c, bfr);
-#else
-    for (j = 0; j < n; j++)
-      z[j] = r[j];
-#endif
-    metric_toc(c, SCHUR_PROJECT_PRECOND);
-
-    if (null_space)
-      ortho(z, n, ng, c);
-    for (j = 0; j < n; j++)
-      dz[j] = z[j] - z0[j];
-
-    // Do the following two reductions together
-    rzt = rz1;
-    rz1 = dot(r, z, n);
-    comm_allreduce(c, gs_double, gs_add, &rz1, 1, buf);
-    rz2 = dot(r, dz, n);
-    comm_allreduce(c, gs_double, gs_add, &rz2, 1, buf);
-
-    if (c->id == 0 && verbose > 0) {
-      printf("i = %u rr = %e rtol = %e rz0 = %e rz1 = %e rz2 = %e\n", i, rr,
-             rtol, rzt, rz1, rz2);
-      fflush(stdout);
-    }
-
-    beta = rz2 / rzt;
-    for (j = 0; j < n; j++)
-      p[j] = z[j] + beta * p[j];
-
-    for (k = 0; k < n; k++)
-      P[miter * n + k] = 0;
-
-    for (j = 0; j <= i; j++) {
-      pw = 0;
-      for (k = 0; k < n; k++)
-        pw += W[j * n + k] * p[k];
-      comm_allreduce(c, gs_double, gs_add, &pw, 1, buf);
-      for (k = 0; k < n; k++)
-        P[miter * n + k] += pw * P[j * n + k];
-    }
-
-    for (k = 0; k < n; k++)
-      p[k] -= P[miter * n + k];
-  }
-
-  free(z);
-  free(P);
-
-  return i == miter ? i : i + 1;
-}
-
-//==============================================================================
-// Dump matrix for debug purposes
-//
-struct mij_t {
-  ulong r, c;
-  scalar v;
-  uint p;
-};
-
-static int append_par_mat(struct array *mijs, const struct par_mat *A) {
-  struct mij_t t = {.r = 0, .c = 0, .v = 0, .p = 0};
-  if (IS_CSR(A)) {
-    for (uint i = 0; i < A->rn; i++) {
-      t.r = A->rows[i];
-      for (uint j = A->adj_off[i]; j < A->adj_off[i + 1]; j++) {
-        t.c = A->cols[A->adj_idx[j]], t.v = A->adj_val[j];
-        array_cat(struct mij_t, mijs, &t, 1);
-      }
-      if (IS_DIAG(A)) {
-        t.c = t.r, t.v = A->diag_val[i];
-        array_cat(struct mij_t, mijs, &t, 1);
-      }
-    }
-  } else if (IS_CSC(A)) {
-    for (uint i = 0; i < A->cn; i++) {
-      t.c = A->cols[i];
-      for (uint j = A->adj_off[i]; j < A->adj_off[i + 1]; j++) {
-        t.r = A->rows[A->adj_idx[j]], t.v = A->adj_val[j];
-        array_cat(struct mij_t, mijs, &t, 1);
-      }
-      if (IS_DIAG(A)) {
-        t.r = t.c, t.v = A->diag_val[i];
-        array_cat(struct mij_t, mijs, &t, 1);
-      }
-    }
-  }
-  return 0;
-}
-
-int schur_dump(const char *name, const struct mat *B,
-               const struct par_mat *A_ls, const struct par_mat *A_sl,
-               const struct par_mat *A_ss, struct crystal *cr, buffer *bfr) {
-  struct comm *c = &cr->comm;
-
-  struct array mijs;
-  array_init(struct mij_t, &mijs, 1000);
-
-  struct mij_t m = {.r = 0, .c = 0, .v = 0, .p = 0};
-  for (uint i = 0; i < B->n; i++) {
-    m.r = B->start + i;
-    for (uint j = B->Lp[i]; j < B->Lp[i + 1]; j++) {
-      m.c = B->start + B->Li[j], m.v = B->L[j];
-      array_cat(struct mij_t, &mijs, &m, 1);
-    }
-    if (B->D != NULL) {
-      m.c = m.r, m.v = B->D[i];
-      array_cat(struct mij_t, &mijs, &m, 1);
-    }
-  }
-
-  append_par_mat(&mijs, A_ls);
-  append_par_mat(&mijs, A_sl);
-  append_par_mat(&mijs, A_ss);
-
-  sarray_transfer(struct mij_t, &mijs, p, 0, cr);
-  sarray_sort_2(struct mij_t, mijs.ptr, mijs.n, r, 1, c, 1, bfr);
-
-  if (c->id == 0 && mijs.n > 0) {
-    FILE *fp = fopen(name, "w");
-    if (fp != NULL) {
-      struct mij_t *pm = (struct mij_t *)mijs.ptr;
-      for (uint i = 0; i < mijs.n; i++)
-        fprintf(fp, "%llu %llu %.15lf\n", pm[i].r, pm[i].c, pm[i].v);
-      fclose(fp);
-    }
-  }
-
-  array_free(&mijs);
-
-  return 0;
-}
-
-//==============================================================================
-// Schur setup
-//
-int schur_setup(struct coarse *crs, struct array *eij, struct crystal *cr,
-                buffer *bfr) {
-  struct comm *c = &cr->comm;
-  comm_barrier(c);
-  double t = comm_time();
-
-  // Setup A_ll
-  struct array ll, ls, sl, ss;
-  array_init(struct mij, &ll, eij->n / 4 + 1);
-  array_init(struct mij, &ls, eij->n / 4 + 1);
-  array_init(struct mij, &sl, eij->n / 4 + 1);
-  array_init(struct mij, &ss, eij->n / 4 + 1);
-
-  struct mij *ptr = (struct mij *)eij->ptr;
-  for (uint i = 0; i < eij->n; i++) {
-    if (ptr[i].r <= crs->ng[0]) {
-      if (ptr[i].c <= crs->ng[0])
-        array_cat(struct mij, &ll, &ptr[i], 1);
-      else
-        array_cat(struct mij, &ls, &ptr[i], 1);
-    } else if (ptr[i].c <= crs->ng[0]) {
-      array_cat(struct mij, &sl, &ptr[i], 1);
-    } else {
-      array_cat(struct mij, &ss, &ptr[i], 1);
-    }
-  }
-
-  t = comm_time() - t;
-  double wrk, min = t, max = t;
-  comm_allreduce(c, gs_double, gs_min, &min, 1, &wrk);
-  comm_allreduce(c, gs_double, gs_max, &max, 1, &wrk);
-  if (c->id == 0) {
-    printf("\tSeparate matrices: %g %g (min max)\n", min, max);
-    fflush(stdout);
-  }
-
-  comm_barrier(c);
-  t = comm_time();
-
-  struct schur *schur = crs->solver = (struct schur *)tcalloc(struct schur, 1);
-
-  // Setup local block diagonal (B). This is distributed by rows based on the
-  // partitioning.
-  struct mat B;
-  csr_setup(&B, &ll, 0, bfr);
-  if (!crs->null_space || (crs->n[1] + crs->n[2] != 0))
-    cholesky_factor(&schur->A_ll, &B, 0, bfr);
-  else
-    cholesky_factor(&schur->A_ll, &B, 1, bfr);
-  schur->A_ll.start = crs->s[0];
-  array_free(&ll);
-
-  min = max = comm_time() - t;
-  comm_allreduce(c, gs_double, gs_min, &min, 1, &wrk);
-  comm_allreduce(c, gs_double, gs_max, &max, 1, &wrk);
-  if (c->id == 0) {
-    printf("\tSetup B          : %g %g (min max)\n", min, max);
-    fflush(stdout);
-  }
-
-  comm_barrier(c);
-  t = comm_time();
-
-  // Setup S: Setup interface nodes. This is distributed by rows in a load
-  // balanced manner.
-  par_csr_setup(&schur->A_ss, &ss, 1, bfr);
-  array_free(&ss);
-  schur->Q_ss = setup_Q(&schur->A_ss, &cr->comm, bfr);
-
-  min = max = comm_time() - t;
-  comm_allreduce(c, gs_double, gs_min, &min, 1, &wrk);
-  comm_allreduce(c, gs_double, gs_max, &max, 1, &wrk);
-  if (c->id == 0) {
-    printf("\tSetup S          : %g %g (min max)\n", min, max);
-    fflush(stdout);
-  }
-
-  comm_barrier(c);
-  t = comm_time();
-
-  // Setup F: Setup local interface connectivity. This is distributed by rows
-  // similar to B.
-  par_csr_setup(&schur->A_ls, &ls, 0, bfr);
-  array_free(&ls);
-  schur->Q_ls = setup_Fxi_Q(&schur->A_ls, crs->s[1], crs->n[1], &cr->comm, bfr);
-
-  min = max = comm_time() - t;
-  comm_allreduce(c, gs_double, gs_min, &min, 1, &wrk);
-  comm_allreduce(c, gs_double, gs_max, &max, 1, &wrk);
-  if (c->id == 0) {
-    printf("\tSetup F          : %g %g (min max)\n", min, max);
-    fflush(stdout);
-  }
-
-  comm_barrier(c);
-  t = comm_time();
-
-  // Setup E: E is distributed by columns in the same manner as columns (or
-  // rows) of B.
-  distribute_by_columns(&sl, crs->s[0], crs->n[0], crs->ng[0], cr, bfr);
-  par_csc_setup(&schur->A_sl, &sl, 0, bfr);
-  array_free(&sl);
-  schur->Q_sl = setup_Ezl_Q(&schur->A_sl, crs->s[1], crs->n[1], &cr->comm, bfr);
-
-  min = max = comm_time() - t;
-  comm_allreduce(c, gs_double, gs_min, &min, 1, &wrk);
-  comm_allreduce(c, gs_double, gs_max, &max, 1, &wrk);
-  if (c->id == 0) {
-    printf("\tSetup E          : %g %g (min max)\n", min, max);
-    fflush(stdout);
-  }
-
-  comm_barrier(c);
-  t = comm_time();
-
-  // Setup the preconditioner for the Schur complement matrix
-  schur->M = schur_precond_setup(&schur->A_ll, &schur->A_ls, &schur->A_ss,
-                                 &schur->A_sl, crs->s[1], crs->n[1], cr, bfr);
-
-  min = max = comm_time() - t;
-  comm_allreduce(c, gs_double, gs_min, &min, 1, &wrk);
-  comm_allreduce(c, gs_double, gs_max, &max, 1, &wrk);
-  if (c->id == 0) {
-    printf("\tSetup MG Precond : %g %g (min max)\n", min, max);
-    fflush(stdout);
-  }
-
-  return 0;
-}
-
-int schur_solve(scalar *x, struct coarse *crs, scalar *b, scalar tol,
-                buffer *bfr) {
-  struct comm *c = &crs->c;
-  struct schur *schur = crs->solver;
-
-  uint ln = crs->n[0], in = crs->n[1];
-  scalar *rhs = (scalar *)tcalloc(scalar, ln > in ? ln : in);
-  scalar *zl = (scalar *)tcalloc(scalar, ln);
-  scalar *xl = (scalar *)tcalloc(scalar, in + ln), *xi = xl + ln;
-
-  // Solve: A_ll z_l = r_l
-  for (uint i = 0; i < ln; i++)
-    rhs[i] = b[i];
-
-  metric_tic(c, SCHUR_SOLVE_CHOL1);
-  cholesky_solve(zl, &schur->A_ll, rhs);
-  if (crs->null_space && (crs->n[1] + crs->n[2]) == 0)
-    zl[ln - 1] = 0;
-  metric_toc(c, SCHUR_SOLVE_CHOL1);
-
-  metric_tic(c, SCHUR_SOLVE_SETRHS1);
-  // Solve: A_ss x_i = fi where fi = r_i - E zl
-  Ezl(rhs, &schur->A_sl, schur->Q_sl, zl, crs->s[0], in, bfr);
-  for (uint i = 0; i < in; i++)
-    rhs[i] = b[ln + i] - rhs[i];
-  metric_toc(c, SCHUR_SOLVE_SETRHS1);
-
-  metric_tic(c, SCHUR_SOLVE_PROJECT);
-  unsigned miter = (tol < 0 ? fabs(tol) : 100);
-  scalar mtol = (tol > 0 ? tol : 1e-7);
-  int iter = project(xi, rhs, schur, crs->s[0], c, miter, mtol, 0, 1, bfr);
-  metric_toc(c, SCHUR_SOLVE_PROJECT);
-  metric_acc(SCHUR_PROJECT_NITER, iter);
-
-  // Solve A_ll xl = fl where fl = r_l - F xi
-  metric_tic(c, SCHUR_SOLVE_SETRHS2);
-  for (uint i = 0; i < ln; i++)
-    rhs[i] = 0;
-  Fxi(rhs, &schur->A_ls, schur->Q_ls, xi, crs->s[0], in, bfr);
-  for (uint i = 0; i < ln; i++)
-    rhs[i] = b[i] - rhs[i];
-  metric_toc(c, SCHUR_SOLVE_SETRHS2);
-
-  metric_tic(c, SCHUR_SOLVE_CHOL2);
-  cholesky_solve(xl, &schur->A_ll, rhs);
-  if (crs->null_space && (crs->n[1] + crs->n[2]) == 0)
-    xl[ln - 1] = 0;
-  metric_toc(c, SCHUR_SOLVE_CHOL2);
-
-  for (uint i = 0; i < ln + in; i++)
-    x[i] = xl[i];
-
-  if (crs->null_space) {
-    scalar sum = 0, wrk;
-    for (uint i = 0; i < ln + in; i++)
-      sum += x[i];
-    comm_allreduce(c, gs_double, gs_add, &sum, 1, &wrk);
-    sum = sum / (crs->ng[0] + crs->ng[1] + crs->ng[2]);
-    for (uint i = 0; i < ln + in; i++)
-      x[i] -= sum;
-  }
-
-  free(rhs), free(zl), free(xl);
-
-  return 0;
-}
-
-int schur_free(struct coarse *crs) {
-  struct schur *schur = (struct schur *)crs->solver;
-  if (schur != NULL) {
-    mat_free(&schur->A_ll);
-    par_mat_free(&schur->A_ls);
-    if (schur->Q_ls != NULL)
-      gs_free(schur->Q_ls), schur->Q_ls = NULL;
-    par_mat_free(&schur->A_sl);
-    if (schur->Q_sl != NULL)
-      gs_free(schur->Q_sl), schur->Q_sl = NULL;
-    par_mat_free(&schur->A_ss);
-    if (schur->Q_ss != NULL)
-      gs_free(schur->Q_ss), schur->Q_ss = NULL;
-    if (schur->M != NULL)
-      mg_free(schur->M), schur->M = NULL;
-    free(schur), schur = NULL;
-  }
-
-  return 0;
-}
-
-#undef MAX
diff --git a/src/sort-bin.c b/src/sort-bin.c
new file mode 100644
index 00000000..cb6c6d9e
--- /dev/null
+++ b/src/sort-bin.c
@@ -0,0 +1,67 @@
+#include "sort-impl.h"
+
+// Assign a destination process to every local element based on the value of
+// key `field`: the global value range [min, max] is cut into `c->np` equal
+// bins and bin k is sent to process k. The local array must already be sorted
+// by that key, since the bins are filled in a single forward sweep. Returns a
+// newly allocated array (one entry per local element) that the caller must
+// free, or NULL if there are no local elements.
+static uint *set_proc_from_val(struct sort *s, uint field,
+                               const struct comm *c) {
+  struct array *arr = s->a;
+  const gs_dom type = s->t[field];
+  const uint off = s->offset[field];
+
+  // NOTE(review): get_extrema() takes the communicator, so it is presumably a
+  // collective call; it is therefore made before the empty-array return.
+  double extrema[2];
+  get_extrema((void *)extrema, s, field, c);
+  const double range = extrema[1] - extrema[0];
+
+  const uint n = arr->n;
+  if (n == 0)
+    return NULL;
+  uint *proc = tcalloc(uint, n);
+
+  const uint np = c->np;
+  assert(np > 0);
+
+  // Walk the bins in increasing order; `i` only ever moves forward.
+  uint bin = 0, i = 0;
+  do {
+    const double upper = extrema[0] + (range / np) * (bin + 1);
+    for (; i < n; i++) {
+      const double val = get_scalar(arr, i, off, s->unit_size, type);
+      if (!(val <= upper))
+        break;
+      proc[i] = bin;
+    }
+    bin++;
+  } while (bin < np && i < n);
+
+  // Anything left over (e.g. when round-off puts the last bin boundary below
+  // the maximum) goes to the last process.
+  while (i < n)
+    proc[i++] = np - 1;
+
+  return proc;
+}
+
+// Sort the array across all processes of `c` by binning on the first key:
+// sort locally, ship every element to the process owning its bin, then sort
+// locally once more.
+void parallel_bin_sort(struct sort *s, const struct comm *c) {
+  // The bin assignment below relies on a locally sorted array.
+  sort_local(s);
+
+  // Destination of each element, derived from the value of key 0.
+  uint *dest = set_proc_from_val(s, 0, c);
+
+  // Move the elements to their bins; `dest` is NULL when nothing is local.
+  sarray_transfer_chunk(s->a, s->unit_size, dest, c);
+  free(dest);
+
+  // Sort locally again so that the array is both globally and locally
+  // sorted.
+  sort_local(s);
+}
diff --git a/src/sort-hypercube.c b/src/sort-hypercube.c
new file mode 100644
index 00000000..d8f01a52
--- /dev/null
+++ b/src/sort-hypercube.c
@@ -0,0 +1,181 @@
+#include "sort-impl.h"
+#include <math.h>
+
+// State shared by all levels of the recursive hypercube sort: the array being
+// sorted, the probe values used to bisect for a splitter, and for each probe
+// the number of elements (summed over the communicator) that lie below it.
+struct hypercube {
+  struct sort *data;
+  int nprobes;
+  double *probes;
+  ulong *probe_cnt;
+};
+
+// Allocate the probe arrays on first use and reset the three probes to the
+// lower extremum, the midpoint and the upper extremum of the first sort key,
+// as reported by get_extrema() on communicator `c`.
+static void init_probes(struct hypercube *data, const struct comm *c) {
+  // Allocate space for probes and counts.
+  int nprobes = data->nprobes = 3;
+  if (!data->probes)
+    data->probes = tcalloc(double, nprobes);
+  if (!data->probe_cnt)
+    data->probe_cnt = tcalloc(ulong, nprobes);
+
+  double extrema[2];
+  get_extrema((void *)extrema, data->data, 0, c);
+  double range = extrema[1] - extrema[0];
+  double delta = range / (nprobes - 1);
+
+  data->probes[0] = extrema[0];
+  data->probes[1] = extrema[0] + delta;
+  data->probes[2] = extrema[1];
+}
+
+// For every probe, count the elements whose first key is strictly smaller
+// than the probe and sum the counts over all processes in `c`.
+static void update_probe_counts(struct hypercube *data, const struct comm *c) {
+  struct sort *input = data->data;
+  uint offset = input->offset[0];
+  gs_dom t = input->t[0];
+
+  uint nprobes = data->nprobes;
+  for (uint i = 0; i < nprobes; i++)
+    data->probe_cnt[i] = 0;
+
+  struct array *a = input->a;
+  for (uint e = 0; e < a->n; e++) {
+    double val = get_scalar(a, e, offset, input->unit_size, t);
+    for (uint i = 0; i < nprobes; i++) {
+      if (val < data->probes[i])
+        data->probe_cnt[i]++;
+    }
+  }
+
+  // Work space for the reduction (fixed size; nprobes is always 3 here).
+  slong wrk[6];
+  comm_allreduce(c, gs_long, gs_add, data->probe_cnt, nprobes, wrk);
+}
+
+// One bisection step on the middle probe: if the number of elements below it
+// is not yet within `threshold` of nelem / 2, shrink the bracket
+// [probes[0], probes[2]] towards the side that contains the median and move
+// the middle probe to the centre of the new bracket.
+static void update_probes(slong nelem, double *probes, ulong *probe_cnt,
+                          uint threshold) {
+  assert(nelem >= 0);
+  slong expected = nelem / 2;
+  if (llabs(expected - (slong)probe_cnt[1]) < threshold)
+    return;
+
+  if (probe_cnt[1] < (ulong)expected)
+    probes[0] = probes[1];
+  else
+    probes[2] = probes[1];
+
+  probes[1] = probes[0] + (probes[2] - probes[0]) / 2.0;
+}
+
+// Send every element below the middle probe to the lower half of the
+// processes in `c` and every other element to the upper half. The local array
+// is expected to be sorted already, so the lower elements come first.
+static void transfer_elem(const struct hypercube *data, const struct comm *c) {
+  struct sort *input = data->data;
+  uint usize = input->unit_size;
+  uint offset = input->offset[0];
+  gs_dom t = input->t[0];
+  struct array *a = input->a;
+
+  uint size = a->n, lown = 0, uppern = 0;
+  for (uint e = 0; e < size; e++) {
+    double val = get_scalar(a, e, offset, usize, t);
+    if (val < data->probes[1])
+      lown++;
+    else
+      uppern++;
+  }
+
+  // out[0]: this process' start offsets from the scan, out[1]: totals over c.
+  slong out[2][2], in[2] = {lown, uppern}, wrk[2][2];
+  comm_scan(out, c, gs_long, gs_add, in, 2, wrk);
+  slong lstart = out[0][0], ustart = out[0][1];
+  slong lelem = out[1][0], uelem = out[1][1];
+
+  // Arguments follow the prototype in sort-impl.h: (size, np, start, nelem),
+  // i.e. the local element count first, then the number of target processes.
+  uint np = c->np, lnp = np / 2;
+  uint *proc1 = set_proc_from_idx(lown, lnp, lstart, lelem);
+  uint *proc2 = set_proc_from_idx(uppern, np - lnp, ustart, uelem);
+  // Append the upper-half destinations, shifted past the lower-half ranks.
+  proc1 = trealloc(uint, proc1, size);
+  for (uint e = lown; e < size; e++)
+    proc1[e] = proc2[e - lown] + lnp;
+
+  sarray_transfer_chunk(a, usize, proc1, c);
+  free(proc1), free(proc2);
+}
+
+// Recursive step: bisect for a value that splits the elements roughly in
+// half, move the lower elements to the lower half of the processes and the
+// rest to the upper half, then recurse on this process' half of `c`.
+// TODO: Get rid of this recursive implementation.
+static void parallel_hypercube_sort_aux(struct hypercube *data,
+                                        const struct comm *c) {
+  struct sort *input = data->data;
+  struct array *a = input->a;
+
+  // FIXME: Replace comm_scan() by comm_allreduce().
+  slong out[2][1], buf[2][1], in = a->n;
+  comm_scan(out, c, gs_long, gs_add, &in, 1, buf);
+  slong nelem = out[1][0];
+
+  uint threshold = nelem / (10 * c->np);
+  if (threshold < 2)
+    threshold = 2;
+
+  sort_local(data->data);
+
+  if (c->np == 1)
+    return;
+
+  init_probes(data, c);
+  update_probe_counts(data, c);
+
+  // Bisect until the bracket is about 1e-12 wide. When the bracket is already
+  // that narrow (e.g. all keys equal), or its width is not a positive number,
+  // no bisection is done; log2() of such a value must not be cast to int.
+  double width = data->probes[2] - data->probes[0];
+  int max_iter = (width > 1e-12) ? (int)log2(width / 1e-12) : 0;
+  int iter = 0;
+  while (llabs(nelem / 2 - (slong)data->probe_cnt[1]) > threshold &&
+         iter++ < max_iter) {
+    update_probes(nelem, data->probes, data->probe_cnt, threshold);
+    update_probe_counts(data, c);
+  }
+
+  transfer_elem(data, c);
+
+  // split the communicator
+  struct comm nc;
+  sint lower = (c->id < c->np / 2);
+  comm_split(c, lower, c->id, &nc);
+
+  // TODO: Keep load balancing after each split
+  parallel_hypercube_sort_aux(data, &nc);
+
+  comm_free(&nc);
+}
+
+// Entry point of the hypercube sort: sorts `sd` across the processes of `c`,
+// splitting on its first key. Works on a duplicate of `c` and releases the
+// probe buffers when done.
+void parallel_hypercube_sort(struct sort *sd, const struct comm *c) {
+  struct comm dup;
+  comm_dup(&dup, c);
+
+  struct hypercube hdata = {.data = sd, .probes = NULL, .probe_cnt = NULL};
+  parallel_hypercube_sort_aux(&hdata, &dup);
+  free(hdata.probes), free(hdata.probe_cnt);
+
+  comm_free(&dup);
+}
diff --git a/src/sort-impl.h b/src/sort-impl.h
new file mode 100644
index 00000000..47a52100
--- /dev/null
+++ b/src/sort-impl.h
@@ -0,0 +1,34 @@
+#ifndef _PARRSB_SORT_IMPL_H_
+#define _PARRSB_SORT_IMPL_H_
+// Declarations shared between the sort implementation files.
+#include "sort.h"
+// Read the `type` value at byte `offset` of element `i` (stride `usize`) as a double.
+double get_scalar(struct array *a, uint i, uint offset, uint usize,
+                  gs_dom type);
+// Allocate destination ranks for `size` entries; NULL if `nelem` is 0. Caller frees.
+uint *set_proc_from_idx(uint size, sint np, slong start, slong nelem);
+// Send element i of `arr` to rank proc[i], using several size-limited transfers.
+void sarray_transfer_chunk(struct array *arr, const size_t usize,
+                           const uint *proc, const struct comm *c);
+// Everything the sort routines need to know about one sort request.
+struct sort {
+  struct array *a; // array to sort
+  size_t unit_size, align; // size and alignment of one array element
+
+  int nfields; // number of entries used in t[] and offset[]
+  gs_dom t[3]; // type of each sort field
+  uint offset[3]; // byte offset of each sort field inside an element
+
+  buffer *buf; // work buffer passed to the local sort
+};
+// Sort `s->a` in place on the calling process.
+void sort_local(struct sort *s);
+// Write the global {min, max} of sort field `field` to `extrema_` (two doubles).
+void get_extrema(void *extrema_, struct sort *data, uint field,
+                 const struct comm *c);
+// Parallel sort by recursive halving of `c`.
+void parallel_hypercube_sort(struct sort *s, const struct comm *c);
+// Parallel bin sort; defined outside sort.c (presumably in sort-bin.c).
+void parallel_bin_sort(struct sort *s, const struct comm *c);
+
+#endif // _PARRSB_SORT_IMPL_H_
diff --git a/src/sort.c b/src/sort.c
index 5de28e61..93b0adb5 100644
--- a/src/sort.c
+++ b/src/sort.c
@@ -1,9 +1,12 @@
-#include "sort.h"
+#include "sort-impl.h"
 #include <float.h>
 #include <math.h>
 
-static double get_scalar(struct array *a, uint i, uint offset, uint usize,
-                         gs_dom type) {
+extern void parrsb_print(const struct comm *c, int verbose, const char *fmt,
+                         ...);
+
+double get_scalar(struct array *a, uint i, uint offset, uint usize,
+                  gs_dom type) {
   char *v = (char *)a->ptr + i * usize + offset;
 
   double data;
@@ -18,14 +21,16 @@ static double get_scalar(struct array *a, uint i, uint offset, uint usize,
     data = *((double *)v);
     break;
   default:
+    fprintf(stderr, "Error: Unknown type %d\n", type);
+    exit(EXIT_FAILURE);
     break;
   }
 
   return data;
 }
 
-static void get_extrema(void *extrema_, struct sort *data, uint field,
-                        const struct comm *c) {
+void get_extrema(void *extrema_, struct sort *data, uint field,
+                 const struct comm *c) {
   struct array *a = data->a;
   uint usize = data->unit_size;
   uint offset = data->offset[field];
@@ -41,70 +46,35 @@ static void get_extrema(void *extrema_, struct sort *data, uint field,
     extrema[1] = get_scalar(a, size - 1, offset, usize, t);
   }
 
-  double buf[2];
+  double buf[4];
   comm_allreduce(c, gs_double, gs_max, extrema, 2, buf);
   extrema[0] *= -1;
 }
 
-static int set_dest(uint *proc, uint size, sint np, slong start, slong nelem) {
+uint *set_proc_from_idx(uint size, sint np_, slong start, slong nelem) {
   if (nelem == 0)
-    return 1;
+    return NULL;
+  uint *proc = tcalloc(uint, size + 1);
 
-  uint nelt = nelem / np, nrem = nelem - np * nelt;
+  ulong np = np_;
+  ulong nelt = nelem / np, nrem = nelem - np * nelt;
+  assert(nrem < np);
   if (nrem == 0) {
-    for (uint i = 0; i < size; i++) {
-      proc[i] = (start + i) / nelt;
-    }
+    for (uint i = 0; i < size; i++)
+      proc[i] = (uint)((start + i) / nelt);
   } else {
-    uint s = np - nrem;
-    slong t = nelt * s;
+    ulong s = np - nrem;
+    ulong t1 = nelt * s;
     for (uint i = 0; i < size; i++) {
-      if (start + i < t)
-        proc[i] = (start + i) / nelt;
+      ulong spi = start + i;
+      if (spi < t1)
+        proc[i] = (uint)(spi / nelt);
       else
-        proc[i] = s + (start + i - t) / (nelt + 1);
+        proc[i] = (uint)s + (uint)((spi - t1) / (nelt + 1));
     }
   }
 
-  return 0;
-}
-
-//-----------------------------------------------------------------------------
-// Parallel Bin-Sort
-//
-static int set_bin(uint **proc_, struct sort *s, uint field,
-                   const struct comm *c) {
-  struct array *a = s->a;
-  gs_dom t = s->t[field];
-  uint offset = s->offset[field];
-
-  uint size = a->n;
-  uint *proc = *proc_ = tcalloc(uint, size);
-
-  double extrema[2];
-  get_extrema((void *)extrema, s, field, c);
-  double range = extrema[1] - extrema[0];
-
-  if (size == 0)
-    return 0;
-
-  sint np = c->np;
-  uint id = 0;
-  uint index = 0;
-  do {
-    double end = extrema[0] + (range / np) * (id + 1);
-    while (index < size) {
-      double val = get_scalar(a, index, offset, s->unit_size, t);
-      if (val <= end)
-        proc[index++] = id;
-      else
-        break;
-    }
-    id++;
-  } while (id < np && index < size);
-  for (; index < size; index++)
-    proc[index] = np - 1;
-  return 0;
+  return proc;
 }
 
 static int sort_field(struct array *arr, size_t usize, gs_dom t, uint off,
@@ -131,7 +101,7 @@ static int sort_field(struct array *arr, size_t usize, gs_dom t, uint off,
   return 0;
 }
 
-int sort_local(struct sort *s) {
+void sort_local(struct sort *s) {
   struct array *a = s->a;
   buffer *buf = s->buf;
   size_t usize = s->unit_size;
@@ -141,247 +111,120 @@ int sort_local(struct sort *s) {
   while (i >= 0)
     sort_field(a, usize, s->t[i], s->offset[i], buf, 1), i--;
   sarray_permute_buf_(s->align, usize, a->ptr, a->n, buf);
-
-  return 0;
 }
 
-static int parallel_bin_sort(struct sort *s, const struct comm *c) {
-  // Local sort
-  sort_local(s);
-
-  // Set destination bin
-  uint *proc;
-  set_bin(&proc, s, 0, c);
-
-  // Transfer to destination processor
-  struct crystal cr;
-  crystal_init(&cr, c);
-  sarray_transfer_ext_(s->a, s->unit_size, proc, sizeof(uint), &cr);
-  crystal_free(&cr);
+static int load_balance(struct array *a, size_t size, const struct comm *c) {
+  slong out[2][1], wrk[2][1], in = a->n;
+  comm_scan(out, c, gs_long, gs_add, &in, 1, wrk);
+  slong start = out[0][0], nelem = out[1][0];
 
+  parrsb_print(c, 0, "\t\t\tstart = %lld, nelem = %lld", start, nelem);
+  uint *proc = set_proc_from_idx(a->n, c->np, start, nelem);
+  sarray_transfer_chunk(a, size, proc, c);
   free(proc);
 
-  // Locally sort again
-  sort_local(s);
-
-  return 0;
-}
-
-//-----------------------------------------------------------------------------
-// Parallel Hypercube-Sort
-//
-struct hypercube {
-  struct sort *data;
-  int nprobes;
-  double *probes;
-  ulong *probe_cnt;
-};
-
-static int init_probes(struct hypercube *data, struct comm *c) {
-  struct sort *input = data->data;
-
-  // Allocate space for probes and counts
-  int nprobes = data->nprobes = 3;
-  if (!data->probes)
-    data->probes = tcalloc(double, nprobes);
-  if (!data->probe_cnt)
-    data->probe_cnt = tcalloc(ulong, nprobes);
-
-  double extrema[2];
-  get_extrema((void *)extrema, data->data, 0, c);
-  double range = extrema[1] - extrema[0];
-  double delta = range / (nprobes - 1);
-
-  data->probes[0] = extrema[0];
-  data->probes[1] = extrema[0] + delta;
-  data->probes[2] = extrema[1];
-
-  return 0;
-}
-
-static int update_probe_counts(struct hypercube *data, struct comm *c) {
-  struct sort *input = data->data;
-  uint offset = input->offset[0];
-  gs_dom t = input->t[0];
-
-  uint nprobes = data->nprobes;
-  uint i;
-  for (i = 0; i < nprobes; i++)
-    data->probe_cnt[i] = 0;
-
-  struct array *a = input->a;
-  uint e;
-  for (e = 0; e < a->n; e++) {
-    double val_e = get_scalar(a, e, offset, input->unit_size, t);
-    for (i = 0; i < nprobes; i++)
-      if (val_e < data->probes[i])
-        data->probe_cnt[i]++;
-  }
-
-  ulong buf[3];
-  comm_allreduce(c, gs_long, gs_add, data->probe_cnt, nprobes, buf);
-
-  return 0;
-}
-
-static int update_probes(slong nelem, double *probes, ulong *probe_cnt,
-                         uint threshold) {
-  slong expected = nelem / 2;
-  if (llabs(expected - (slong)probe_cnt[1]) < threshold)
-    return 0;
-
-  if (probe_cnt[1] < expected)
-    probes[0] = probes[1];
-  else
-    probes[2] = probes[1];
-
-  probes[1] = probes[0] + (probes[2] - probes[0]) / 2.0;
-
   return 0;
 }
 
-static int transfer_elem(struct hypercube *data, struct comm *c) {
-  struct sort *input = data->data;
-  uint usize = input->unit_size, offset = input->offset[0];
-  gs_dom t = input->t[0];
-  struct array *a = input->a;
-
-  uint size = a->n, lown = 0, uppern = 0;
-  for (uint e = 0; e < size; e++) {
-    double val = get_scalar(a, e, offset, usize, t);
-    if (val < data->probes[1])
-      lown++;
-    else
-      uppern++;
-  }
-
-  slong out[2][2], in[2] = {lown, uppern}, buf[2][2];
-  comm_scan(out, c, gs_long, gs_add, in, 2, buf);
-  slong lstart = out[0][0], ustart = out[0][1];
-  slong lelem = out[1][0], uelem = out[1][1];
-
-  uint np = c->np, lnp = np / 2;
-  uint *proc = tcalloc(uint, size);
-  set_dest(proc, lnp, lstart, lown, lelem);
-  set_dest(proc + lown, np - lnp, ustart, uppern, uelem);
-
-  for (uint e = lown; e < size; e++)
-    proc[e] += lnp;
+void sarray_transfer_chunk(struct array *arr, const size_t usize,
+                           const uint *proci, const struct comm *c) {
+  // Calculate the global array size. If it is zero, nothing to do, just return.
+  slong ng = arr->n, wrk[2];
+  comm_allreduce(c, gs_long, gs_add, &ng, 1, wrk);
+  if (ng == 0)
+    return;
 
+  // Initialize the crystal router.
   struct crystal cr;
   crystal_init(&cr, c);
-  sarray_transfer_ext_(a, usize, proc, sizeof(uint), &cr);
-  crystal_free(&cr);
-
-  free(proc);
-
-  return 0;
-}
-
-static int parallel_hypercube_sort(struct hypercube *data, struct comm *c) {
-  struct sort *input = data->data;
-  struct array *a = input->a;
-  gs_dom t = input->t[0];
-  uint offset = input->offset[0];
-
-  sint size = c->np, rank = c->id;
-
-  slong out[2][1], buf[2][1], in = a->n;
-  comm_scan(out, c, gs_long, gs_add, &in, 1, buf);
-  slong start = out[0][0];
-  slong nelem = out[1][0];
-
-  uint threshold = nelem / (10 * size);
-  if (threshold < 2)
-    threshold = 2;
 
-  sort_local(data->data);
-
-  if (size == 1)
-    return 0;
-
-  init_probes(data, c);
-  update_probe_counts(data, c);
-
-  int max_iter = log2((data->probes[2] - data->probes[0]) / 1e-12);
-  int iter = 0;
-  while (llabs(nelem / 2 - (slong)data->probe_cnt[1]) > threshold &&
-         iter++ < max_iter) {
-    update_probes(nelem, data->probes, data->probe_cnt, threshold);
-    update_probe_counts(data, c);
+  // Allocate `proc` with some buffer space.
+  uint *proc = tcalloc(uint, arr->n + 1);
+  for (uint i = 0; i < arr->n; i++)
+    proc[i] = proci[i];
+
+  // Transfer the array elements to destination processor. To avoid message
+  // sizes larger than INT_MAX, we calculate total message size and then figure
+  // out how many transfers we need. Then we transfer array using that many
+  // transfers.
+  slong msg_size = 9 * (INT_MAX / 10);
+  uint nt = (ng * usize + msg_size - 1) / msg_size;
+  parrsb_print(c, 0, "\t\t\tmsg_size = %lld, nt = %u", msg_size, nt);
+  uint tsize = (arr->n + nt - 1) / nt;
+
+  struct array brr, crr;
+  array_init_(&brr, tsize + 1, usize, __FILE__, __LINE__);
+  array_init_(&crr, arr->n + 1, usize, __FILE__, __LINE__);
+
+  char *pe = (char *)arr->ptr;
+  uint off = 0, off1;
+  for (unsigned t = 0; t < nt; t++) {
+    // Copy a chunk from `arr` to `brr`.
+    brr.n = 0, off1 = off + tsize;
+    assert(off <= arr->n);
+    for (uint j = off; j < arr->n && j < off1; j++)
+      array_cat_(usize, &brr, &pe[j * usize], 1, __FILE__, __LINE__);
+
+    // Transfer the chunk in `brr` to the destination.
+    sarray_transfer_ext_(&brr, usize, &proc[off], sizeof(uint), &cr);
+
+    // Append the received chunk to `crr`.
+    array_cat_(usize, &crr, brr.ptr, brr.n, __FILE__, __LINE__);
+    off = (off1 < arr->n ? off1 : arr->n);
+
+    // Some debug printing.
+    slong cmax = crr.n, bmax = brr.n, cmin = crr.n, bmin = brr.n;
+    comm_allreduce(c, gs_long, gs_max, &cmax, 1, wrk);
+    comm_allreduce(c, gs_long, gs_max, &bmax, 1, wrk);
+    comm_allreduce(c, gs_long, gs_min, &cmin, 1, wrk);
+    comm_allreduce(c, gs_long, gs_min, &bmin, 1, wrk);
+    parrsb_print(c, 0, "\t\t\t %d/%d brr.n = %u/%lld/%lld crr.n = %u/%lld/%lld",
+                 t, nt, brr.n, bmin, bmax, crr.n, cmin, cmax);
   }
+  array_free(&brr);
 
-  transfer_elem(data, c);
-
-  // split the communicator
-  struct comm nc;
-  sint lower = (rank < size / 2) ? 1 : 0;
-#if defined(MPI)
-  MPI_Comm nc_;
-  MPI_Comm_split(c->c, lower, rank, &nc_);
-  comm_init(&nc, nc_);
-  MPI_Comm_free(&nc_);
-#else
-  comm_init(&nc, 1);
-#endif
+  arr->n = 0;
+  array_cat_(usize, arr, crr.ptr, crr.n, __FILE__, __LINE__);
+  array_free(&crr);
 
-  // TODO: Keep load balancing after each split
-  parallel_hypercube_sort(data, &nc);
-  comm_free(&nc);
-
-  return 0;
-}
-
-static int load_balance(struct array *a, size_t size, const struct comm *c,
-                        struct crystal *cr) {
-  slong out[2][1], buf[2][1], in = a->n;
-  comm_scan(out, c, gs_long, gs_add, &in, 1, buf);
-  slong start = out[0][0], nelem = out[1][0];
-
-  uint *proc = tcalloc(uint, a->n);
-  set_dest(proc, a->n, c->np, start, nelem);
-  sarray_transfer_ext_(a, size, proc, sizeof(uint), cr);
   free(proc);
-
-  return 0;
+  crystal_free(&cr);
 }
 
-int parallel_sort_private(struct sort *data, const struct comm *c) {
-  struct comm dup;
-  comm_dup(&dup, c);
-
-  int balance = data->balance, algo = data->algo;
-
-  struct array *a = data->a;
-  size_t usize = data->unit_size;
+void parallel_sort_(struct array *arr, size_t usize, size_t align,
+                    unsigned algo, unsigned balance, const struct comm *c,
+                    buffer *bfr, unsigned nfields, ...) {
+  struct sort sd = {.a = arr, .unit_size = usize, .align = align};
+  sd.buf = bfr;
+  sd.nfields = nfields;
+
+  va_list vargs;
+  va_start(vargs, nfields);
+  for (uint i = 0; i < nfields; i++) {
+    sd.t[i] = va_arg(vargs, gs_dom);
+    sd.offset[i] = va_arg(vargs, size_t);
+  }
+  va_end(vargs);
 
-  struct hypercube hdata;
+  // If there is only a single MPI process, just sort locally and return.
+  if (c->np == 1) {
+    sort_local(&sd);
+    return;
+  }
 
   switch (algo) {
   case 0:
-    parallel_bin_sort(data, c);
+    parallel_bin_sort(&sd, c);
     break;
   case 1:
-    hdata.data = data;
-    hdata.probes = NULL;
-    hdata.probe_cnt = NULL;
-    parallel_hypercube_sort(&hdata, &dup);
-    free(hdata.probes);
-    free(hdata.probe_cnt);
+    parallel_hypercube_sort(&sd, c);
     break;
   default:
     break;
   }
 
   if (balance) {
-    struct crystal cr;
-    crystal_init(&cr, c);
-    load_balance(a, usize, c, &cr);
-    crystal_free(&cr);
-    sort_local(data);
+    load_balance(sd.a, sd.unit_size, c);
+    sort_local(&sd);
   }
-
-  comm_free(&dup);
-
-  return 0;
 }
diff --git a/src/sort.h b/src/sort.h
index b2e83cd2..5e8f150f 100644
--- a/src/sort.h
+++ b/src/sort.h
@@ -2,54 +2,18 @@
 #define _PARRSB_SORT_H_
 
 #include <gslib.h>
+#include <stdarg.h>
 
-struct sort {
-  int balance, algo;
+void parallel_sort_(struct array *arr, size_t usize, size_t align,
+                    unsigned algo, unsigned balance, const struct comm *c,
+                    buffer *bfr, unsigned nfields, ...);
 
-  int nfields;
-  gs_dom t[3];
-  uint offset[3];
+#define parallel_sort(T, A, field, type, algo, balance, c, bfr)                \
+  parallel_sort_(A, sizeof(T), ALIGNOF(T), algo, balance, c, bfr, 1, type,     \
+                 offsetof(T, field))
 
-  struct array *a;
-  size_t unit_size, align;
-
-  buffer *buf;
-};
-
-int sort_local(struct sort *s);
-int parallel_sort_private(struct sort *s, const struct comm *c);
-
-// Uniform parallel sort
-#define parallel_sort(T, A, field, type, method, loadbalance, c, bufp)         \
-  do {                                                                         \
-    struct sort sd;                                                            \
-    sd.unit_size = sizeof(T);                                                  \
-    sd.align = ALIGNOF(T);                                                     \
-    sd.nfields = 1;                                                            \
-    sd.t[0] = type;                                                            \
-    sd.offset[0] = offsetof(T, field);                                         \
-    sd.a = A;                                                                  \
-    sd.algo = method;                                                          \
-    sd.balance = loadbalance;                                                  \
-    sd.buf = bufp;                                                             \
-    parallel_sort_private(&sd, c);                                             \
-  } while (0)
-
-#define parallel_sort_2(T, A, f1, t1, f2, t2, method, loadbalance, c, bufp)    \
-  do {                                                                         \
-    struct sort sd;                                                            \
-    sd.unit_size = sizeof(T);                                                  \
-    sd.align = ALIGNOF(T);                                                     \
-    sd.nfields = 2;                                                            \
-    sd.t[0] = t1;                                                              \
-    sd.offset[0] = offsetof(T, f1);                                            \
-    sd.t[1] = t2;                                                              \
-    sd.offset[1] = offsetof(T, f2);                                            \
-    sd.a = A;                                                                  \
-    sd.algo = method;                                                          \
-    sd.balance = loadbalance;                                                  \
-    sd.buf = bufp;                                                             \
-    parallel_sort_private(&sd, c);                                             \
-  } while (0)
+#define parallel_sort_2(T, A, f1, t1, f2, t2, algo, balance, c, bfr)           \
+  parallel_sort_(A, sizeof(T), ALIGNOF(T), algo, balance, c, bfr, 2, t1,       \
+                 offsetof(T, f1), t2, offsetof(T, f2))
 
 #endif
diff --git a/src/statistics.c b/src/statistics.c
new file mode 100644
index 00000000..f8f8697a
--- /dev/null
+++ b/src/statistics.c
@@ -0,0 +1,230 @@
+#include "parrsb-impl.h"
+
+#include <float.h>
+#include <string.h>
+// Id of the partition (ranks sharing `lc`) that the calling rank belongs to.
+static uint get_partition(const struct comm *const gc,
+                          const struct comm *const lc) {
+  // Find the partition id. A partition is a group of processors sharing the
+  // same local communicator.
+  sint out[2][1], wrk[2][1], root = (lc->id == 0); // 1 on each partition root
+  comm_scan(out, gc, gs_int, gs_add, &root, 1, wrk); // scan of roots over gc
+  sint part = out[0][0] * (lc->id == 0); // non-root ranks contribute 0
+  comm_allreduce(lc, gs_int, gs_max, &part, 1, wrk); // share the root's value
+  return part;
+}
+// Count the partitions, other than the caller's own, that share a vertex with it.
+uint parrsb_get_neighbors(const struct array *const elems, const unsigned nv,
+                          const struct comm *const gc,
+                          const struct comm *const lc, buffer *bfr) {
+  const uint n = elems->n;
+  const uint size = elems->n * nv;
+  // One record per element vertex: vertex id, a rank, and the owning partition.
+  struct vertex_t {
+    ulong v;
+    uint p, partition;
+  };
+
+  struct array vertices;
+  array_init(struct vertex_t, &vertices, size);
+  // Each vertex id `v` is addressed to rank v % gc->np so that equal ids meet.
+  const struct rsb_element *const pe =
+      (const struct rsb_element *const)elems->ptr;
+  struct vertex_t vt = {.partition = get_partition(gc, lc)};
+  for (uint i = 0; i < n; i++) {
+    for (uint v = 0; v < nv; v++) {
+      vt.v = pe[i].vertices[v], vt.p = vt.v % gc->np;
+      array_cat(struct vertex_t, &vertices, &vt, 1);
+    }
+  }
+
+  struct crystal cr;
+  crystal_init(&cr, gc);
+  // NOTE(review): the flag 1 presumably makes `p` hold the sender's rank; confirm.
+  sarray_transfer(struct vertex_t, &vertices, p, 1, &cr);
+  sarray_sort(struct vertex_t, vertices.ptr, vertices.n, v, 1, bfr);
+
+  struct array neighbors;
+  array_init(struct vertex_t, &neighbors, vertices.n * 27);
+  // For each run of equal vertex ids, pair every record with every partition in the run.
+  const struct vertex_t *const pv = (const struct vertex_t *const)vertices.ptr;
+  uint s = 0;
+  while (s < vertices.n) {
+    uint e = s + 1;
+    while (e < vertices.n && pv[s].v == pv[e].v)
+      e++;
+    for (uint i = s; i < e; i++) {
+      struct vertex_t vt = pv[i];
+      for (uint j = s; j < e; j++) {
+        vt.partition = pv[j].partition;
+        array_cat(struct vertex_t, &neighbors, &vt, 1);
+      }
+    }
+    s = e;
+  }
+  array_free(&vertices);
+  // Send the pairs to the rank stored in `p`, then order them by partition id.
+  sarray_transfer(struct vertex_t, &neighbors, p, 0, &cr);
+  crystal_free(&cr);
+  sarray_sort(struct vertex_t, neighbors.ptr, neighbors.n, partition, 0, bfr);
+
+  // Now, extract out different partition ids found locally into an array.
+  struct unique_t {
+    uint p, partition;
+  };
+
+  struct array unique;
+  array_init(struct unique_t, &unique, 27);
+
+  if (neighbors.n > 0) {
+    const struct vertex_t *const pn =
+        (const struct vertex_t *const)neighbors.ptr;
+    struct unique_t ut = {.partition = pn[0].partition,
+                          .p = pn[0].partition % lc->np};
+    array_cat(struct unique_t, &unique, &ut, 1);
+    for (uint i = 1; i < neighbors.n; i++) {
+      if (pn[i].partition > pn[i - 1].partition) {
+        ut.partition = pn[i].partition, ut.p = ut.partition % lc->np;
+        array_cat(struct unique_t, &unique, &ut, 1);
+      }
+    }
+  }
+  array_free(&neighbors);
+  // Address each id to rank (partition % lc->np) so duplicates within `lc` meet.
+  crystal_init(&cr, lc);
+  sarray_transfer(struct unique_t, &unique, p, 0, &cr);
+  crystal_free(&cr);
+  // Compact in place, keeping one entry per distinct partition id.
+  sarray_sort(struct unique_t, unique.ptr, unique.n, partition, 0, bfr);
+  sint un = 0;
+  if (unique.n > 0) {
+    un = 1;
+    struct unique_t *pu = (struct unique_t *)unique.ptr;
+    for (uint i = 1; i < unique.n; i++) {
+      if (pu[i].partition > pu[un - 1].partition)
+        pu[un] = pu[i], un++;
+    }
+  }
+  array_free(&unique);
+  // Sum the distinct ids over `lc`; the total presumably includes the own partition.
+  sint wrk;
+  comm_allreduce(lc, gs_int, gs_add, &un, 1, &wrk);
+  assert(un >= 1);
+
+  return un - 1;
+}
+
+static struct array pgeom; // struct pgeom_t records added by parrsb_dump_stats()
+static buffer bfr; // work buffer for the sort in parrsb_dump_stats_end()
+static uint pgeom_initialized = 0; // set by ..._start(), cleared by ..._end()
+static uint nv = 0; // value given to parrsb_dump_stats_start(); 8 selects 3 dimensions
+static uint level = 0; // incremented by every recording parrsb_dump_stats() call
+// Geometry of one partition at one level, as recorded by parrsb_dump_stats().
+struct pgeom_t {
+  uint partition, level; // partition id and the level counter at recording time
+  double centroid[3], min[3], max[3]; // centroid and bounding-box corners
+  uint p; // destination rank for the gather; always set to 0
+};
+// Start collecting partition geometry; does nothing if collection is already active.
+void parrsb_dump_stats_start(const uint nv_) {
+  if (pgeom_initialized)
+    return;
+  // Remember `nv_` and restart the level counter.
+  nv = nv_;
+  level = 0;
+  array_init(struct pgeom_t, &pgeom, 1024);
+  buffer_init(&bfr, 1024);
+  // From here on parrsb_dump_stats() and parrsb_dump_stats_end() do real work.
+  pgeom_initialized = 1;
+}
+// Record the centroid and bounding box of the caller's partition at the next level.
+void parrsb_dump_stats(const struct comm *const gc, const struct comm *const lc,
+                       const struct array *const elems, buffer *bfr) {
+  if (!pgeom_initialized)
+    return;
+  // `elems` holds struct rsb_element entries; the `bfr` argument is not used here.
+  const struct rsb_element *const pe =
+      (const struct rsb_element *const)elems->ptr;
+
+  // Find the centroid and the bounding box of the partition.
+  double centroid[3] = {0.0, 0.0, 0.0};
+  double max[3] = {-DBL_MAX, -DBL_MAX, -DBL_MAX};
+  double min[3] = {DBL_MAX, DBL_MAX, DBL_MAX};
+  const uint n = elems->n;
+  const unsigned ndim = (nv == 8) ? 3 : 2;
+  for (uint e = 0; e < n; e++) {
+    for (uint d = 0; d < ndim; d++) {
+      double c = pe[e].coord[d];
+      centroid[d] += c;
+      max[d] = (max[d] < c) ? c : max[d];
+      min[d] = (min[d] > c) ? c : min[d];
+    }
+  }
+  for (uint d = 0; d < ndim && n > 0; d++) // an empty rank must not compute 0 / 0
+    centroid[d] /= n;
+  // Reduce over the partition; the centroid becomes the mean of the per-rank means.
+  double wrk[3];
+  comm_allreduce(lc, gs_double, gs_min, min, ndim, wrk);
+  comm_allreduce(lc, gs_double, gs_max, max, ndim, wrk);
+  comm_allreduce(lc, gs_double, gs_add, centroid, ndim, wrk);
+  for (uint d = 0; d < ndim; d++)
+    centroid[d] /= lc->np;
+
+  // Partition root accumulates the partition geometry.
+  level++;
+  struct pgeom_t pg = {.partition = get_partition(gc, lc),
+                       .level = level,
+                       .centroid = {centroid[0], centroid[1], centroid[2]},
+                       .max = {max[0], max[1], max[2]},
+                       .min = {min[0], min[1], min[2]},
+                       .p = 0};
+  if (lc->id == 0)
+    array_cat(struct pgeom_t, &pgeom, &pg, 1);
+}
+// Gather the recorded partition geometry on rank 0 of `gc`, write it to a file, reset.
+void parrsb_dump_stats_end(const struct comm *const gc, const char *prefix) {
+  if (!pgeom_initialized)
+    return;
+  // `prefix` starts the output file name; it must be shorter than 64 characters.
+  const uint size = strnlen(prefix, 64);
+  assert(size < 64 && "Prefix must be less than 64 characters.");
+
+  // Send all the data to global root.
+  struct crystal cr;
+  crystal_init(&cr, gc);
+  sarray_transfer(struct pgeom_t, &pgeom, p, 0, &cr);
+  crystal_free(&cr);
+
+  // Sort by level first, then by partition id.
+  sarray_sort_2(struct pgeom_t, pgeom.ptr, pgeom.n, level, 0, partition, 0,
+                &bfr);
+
+  if (gc->id == 0) {
+    char name[BUFSIZ]; // must be writable: snprintf() fills it in below
+    snprintf(name, sizeof(name), "%s_partition_geom_p%06d.txt", prefix,
+             gc->np);
+
+    FILE *fp = fopen(name, "w");
+    if (!fp) {
+      fprintf(stderr, "Failed to open %s for writing.\n", name);
+      exit(EXIT_FAILURE);
+    }
+    // First line: record count; second line: column names; then one record per line.
+    fprintf(fp, "%zu\n", pgeom.n);
+    fprintf(fp, "level partition centroid[0] centroid[1] centroid[2] min[0] "
+                "min[1] min[2] max[0] max[1] max[2]\n");
+    const struct pgeom_t *const pg = (const struct pgeom_t *const)pgeom.ptr;
+    for (uint i = 0; i < pgeom.n; i++) {
+      fprintf(fp, "%u %u %lf %lf %lf %lf %lf %lf %lf %lf %lf\n", pg[i].level,
+              pg[i].partition, pg[i].centroid[0], pg[i].centroid[1],
+              pg[i].centroid[2], pg[i].min[0], pg[i].min[1], pg[i].min[2],
+              pg[i].max[0], pg[i].max[1], pg[i].max[2]);
+    }
+    fclose(fp);
+  }
+  // Release the module state so a later parrsb_dump_stats_start() starts afresh.
+  array_free(&pgeom);
+  buffer_free(&bfr);
+
+  pgeom_initialized = nv = level = 0;
+}