diff --git a/Makefile b/Makefile
index fc1f5a4..13d6604 100644
--- a/Makefile
+++ b/Makefile
@@ -34,7 +34,7 @@ dut_donnabad: $(OBJS_DONNABAD) examples/donnabad/dut_donnabad.c
 
 dut_simple: examples/simple/example.c
 	# higher compiler optimization levels can make this constant time
-	$(CC) $(LDFLAGS) -O0 $(INCS) -o dudect_simple_O0 examples/simple/example.c $(LIBS)
+	$(CC) $(LDFLAGS) -O0 $(INCS) -DMEASUREMENTS_PER_CHUNK=100000 -o dudect_simple_O0 examples/simple/example.c $(LIBS)
 	$(CC) $(LDFLAGS) -O2 $(INCS) -DMEASUREMENTS_PER_CHUNK=100000 -o dudect_simple_O2 examples/simple/example.c $(LIBS)
 
 .c.o:
diff --git a/README.md b/README.md
index db6197d..67bbe95 100644
--- a/README.md
+++ b/README.md
@@ -195,6 +195,7 @@ The following people have contributed to `dudect` through code, bug reports, iss
 * RashidAlsuwaidi
 * paul90317
 * Fabian Albert (https://github.com/FAlbertDev)
+* Anjan Roy (https://github.com/itzmeanjan)
 
 The approach is described in this paper
 > Oscar Reparaz, Josep Balasch and Ingrid Verbauwhede
diff --git a/examples/simple/example.c b/examples/simple/example.c
index 0728143..3b89384 100644
--- a/examples/simple/example.c
+++ b/examples/simple/example.c
@@ -10,7 +10,7 @@ int check_tag(uint8_t *x, uint8_t *y, size_t len) {
   return memcmp(x, y, len);
 }
 
-#define SECRET_LEN_BYTES (16)
+#define SECRET_LEN_BYTES (512)
 
 uint8_t secret[SECRET_LEN_BYTES] = {0, 1, 2, 3, 4, 5, 6, 42};
diff --git a/src/dudect.h b/src/dudect.h
index b08bb57..ee40912 100644
--- a/src/dudect.h
+++ b/src/dudect.h
@@ -71,6 +71,8 @@ extern "C" {
 #include <stdlib.h>
 #include <string.h>
+#include <emmintrin.h>
+#include <x86intrin.h>
 
 #ifdef DUDECT_VISIBLITY_STATIC
 #define DUDECT_VISIBILITY static
@@ -202,7 +204,6 @@ static int cmp(const int64_t *a, const int64_t *b) { return (int)(*a - *b); }
 
 static int64_t percentile(int64_t *a_sorted, double which, size_t size) {
   size_t array_position = (size_t)((double)size * (double)which);
-  assert(array_position >= 0);
   assert(array_position < size);
   return a_sorted[array_position];
 }
@@ -262,16 +263,20 @@ uint8_t randombit(void) {
 }
 
 /*
-  Intel actually recommends calling CPUID to serialize the execution flow
-  and reduce variance in measurement due to out-of-order execution.
-  We don't do that here yet.
-  see §3.2.1 http://www.intel.com/content/www/us/en/embedded/training/ia-32-ia-64-benchmark-code-execution-paper.html
-*/
-static int64_t cpucycles(void) {
-  unsigned int hi, lo;
-
-  __asm__ volatile("rdtsc\n\t" : "=a"(lo), "=d"(hi));
-  return ((int64_t)lo) | (((int64_t)hi) << 32);
+  Returns the current CPU tick count from the *T*ime *S*tamp *C*ounter.
+
+  To force the CPU to issue the RDTSC instruction exactly where we want it, we place an `mfence`
+  instruction before `rdtsc`, which should make all memory load/store operations prior to RDTSC globally visible.
+
+  See https://github.com/oreparaz/dudect/issues/32
+  See RDTSC documentation @ https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=rdtsc&ig_expand=4395,5273
+  See MFENCE documentation @ https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mfence&ig_expand=4395,5273,4395
+
+  Also see https://stackoverflow.com/a/12634857
+*/
+static inline int64_t cpucycles(void) {
+  _mm_mfence();
+  return (int64_t)__rdtsc();
 }
 
 // threshold values for Welch's t-test
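
A note on the `MEASUREMENTS_PER_CHUNK` change above: the `-O0` build now pins the same chunk size (100000 measurements per chunk) as the `-O2` build, so both binaries collect measurements in equally sized batches. A minimal sketch of the override pattern such a compile-time knob typically relies on; the fallback value of 100 here is a placeholder, not necessarily dudect's actual default:

```c
/* Sketch only: the build pins the chunk size with -DMEASUREMENTS_PER_CHUNK=<n>;
   otherwise a header-provided default applies. The value 100 below is
   hypothetical -- the real default lives in src/dudect.h. */
#ifndef MEASUREMENTS_PER_CHUNK
#define MEASUREMENTS_PER_CHUNK 100
#endif
```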
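On the `percentile()` hunk: `array_position` has type `size_t`, which is unsigned, so the removed `assert(array_position >= 0)` could never fail and only triggered tautological-comparison warnings. A self-contained sketch, with illustrative data and percentile value, showing that the remaining upper-bound assert is the only check that can fire:

```c
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Same shape as dudect's percentile(): a_sorted is ascending, which is in [0, 1). */
static int64_t percentile(int64_t *a_sorted, double which, size_t size) {
  size_t array_position = (size_t)((double)size * (double)which);
  /* array_position is unsigned, so `array_position >= 0` is always true;
     only this upper bound is a meaningful check. */
  assert(array_position < size);
  return a_sorted[array_position];
}

int main(void) {
  int64_t ticks[5] = {10, 20, 30, 40, 50}; /* illustrative data */
  /* 0.5 * 5 = 2.5, truncated to index 2, so this prints 30 */
  printf("p50 = %lld\n", (long long)percentile(ticks, 0.5, 5));
  return 0;
}
```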
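On the `cpucycles()` rewrite: the `mfence` forces prior loads and stores to become globally visible before `rdtsc` samples the time stamp counter, trimming measurement variance caused by out-of-order execution. A minimal sketch of how the fenced counter can be exercised on its own, assuming an x86-64 GCC/Clang toolchain; the timed `memcmp` call and buffer size are illustrative only:

```c
#include <emmintrin.h> /* _mm_mfence */
#include <x86intrin.h> /* __rdtsc */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Mirrors the new cpucycles(): fence, then read the time stamp counter. */
static inline int64_t cpucycles(void) {
  _mm_mfence(); /* make prior loads/stores globally visible first */
  return (int64_t)__rdtsc();
}

int main(void) {
  uint8_t a[512] = {0}, b[512] = {0};
  int64_t start = cpucycles();
  volatile int r = memcmp(a, b, sizeof a); /* volatile keeps the call alive */
  int64_t end = cpucycles();
  printf("memcmp returned %d after %lld ticks\n", r, (long long)(end - start));
  return 0;
}
```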