From bca84b5d595419af487bc68e09d67f40c092ed2e Mon Sep 17 00:00:00 2001
From: Eric Humphrey <humphrey.eric@gmail.com>
Date: Thu, 6 Jul 2017 10:25:54 +0100
Subject: [PATCH 1/5] Added a test that stacked bootstrapped streams are
 approximately uniform at large N

---
 tests/test_mux.py | 52 +++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 43 insertions(+), 9 deletions(-)

diff --git a/tests/test_mux.py b/tests/test_mux.py
index 7f2ae0e..d0ad57e 100644
--- a/tests/test_mux.py
+++ b/tests/test_mux.py
@@ -1,7 +1,6 @@
 import pytest
 
 import collections
-import itertools
 import numpy as np
 import random
 
@@ -219,11 +218,19 @@ def test_critical_mux():
     print(collections.Counter(samples))
 
 
+def _choice(vals):
+    while True:
+        yield random.choice(vals)
+
+
+def _cycle(values):
+    while True:
+        for v in values:
+            yield v
+
+
 def test_critical_mux_of_rate_limited_muxes():
     # Check on Issue #79
-    def _choice(vals):
-        while True:
-            yield random.choice(vals)
 
     ab = pescador.Streamer(_choice, 'ab')
     cd = pescador.Streamer(_choice, 'cd')
@@ -258,11 +265,6 @@ def test_restart_mux():
 
 def test_sampled_mux_of_muxes():
 
-    def _cycle(values):
-        while True:
-            for v in values:
-                yield v
-
     # Build some sample streams
     ab = pescador.Streamer(_cycle, 'ab')
     cd = pescador.Streamer(_cycle, 'cd')
@@ -310,3 +312,35 @@ def test_mux_inf_loop():
                        with_replacement=False, random_state=1234)
 
     assert len(list(mux(max_iter=100))) == 0
+
+
+def test_mux_stacked_uniform_convergence():
+    ab = pescador.Streamer(_choice, 'ab')
+    cd = pescador.Streamer(_choice, 'cd')
+    ef = pescador.Streamer(_choice, 'ef')
+    mux1 = pescador.Mux([ab, cd, ef], k=2, rate=2,
+                        with_replacement=False, revive=True)
+
+    gh = pescador.Streamer(_choice, 'gh')
+    ij = pescador.Streamer(_choice, 'ij')
+    kl = pescador.Streamer(_choice, 'kl')
+
+    mux2 = pescador.Mux([gh, ij, kl], k=2, rate=2,
+                        with_replacement=False, revive=True)
+
+    stacked_mux = pescador.Mux([mux1, mux2], k=2, rate=None,
+                               with_replacement=False, revive=True)
+
+    flat_mux = pescador.Mux([ab, cd, ef, gh, ij, kl], k=6, rate=None,
+                            with_replacement=False, revive=False)
+
+    max_iter = 50000
+    samples1 = list(stacked_mux.iterate(max_iter=max_iter))
+    samples2 = list(flat_mux.iterate(max_iter=max_iter))
+    count1 = collections.Counter(samples1)
+    count2 = collections.Counter(samples2)
+    print(count1, count2)
+    assert set('abcdefghijkl') == set(count1.keys()) == set(count2.keys())
+    c1, c2 = [list(c.values()) for c in (count1, count2)]
+    np.testing.assert_almost_equal(
+        np.std(c1) / max_iter, np.std(c2) / max_iter, decimal=2)

From 7cbddcde60c5a6d3cb6c1fcfc9aa9c76976bed05 Mon Sep 17 00:00:00 2001
From: Eric Humphrey <humphrey.eric@gmail.com>
Date: Fri, 7 Jul 2017 14:50:59 +0100
Subject: [PATCH 2/5] Fixed comments from PR

---
 tests/test_mux.py | 40 ++++++++++++++++++++--------------------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/tests/test_mux.py b/tests/test_mux.py
index d0ad57e..01c6c36 100644
--- a/tests/test_mux.py
+++ b/tests/test_mux.py
@@ -161,7 +161,6 @@ def test_mux_of_muxes_itered():
                         random_state=135)
     samples1 = mux1.iterate(max_iter=1000)
     count1 = collections.Counter(samples1)
-    print(count1)
     assert set('abcxyz') == set(count1.keys())
 
     n123 = pescador.Streamer('123')
@@ -171,7 +170,6 @@ def test_mux_of_muxes_itered():
                         random_state=246)
     samples2 = mux2.iterate(max_iter=1000)
     count2 = collections.Counter(samples2)
-    print(count2)
     assert set('123456') == set(count2.keys())
 
     # Note that (random_state=987, k=2) fails.
@@ -180,7 +178,6 @@ def test_mux_of_muxes_itered():
                         random_state=987)
     samples3 = mux3.iterate(max_iter=1000)
     count3 = collections.Counter(samples3)
-    print(count3)
     assert set('abcxyz123456') == set(count3.keys())
 
 
@@ -203,19 +200,20 @@ def test_mux_of_muxes_single():
                         prune_empty_streams=False)
     samples3 = list(mux3.iterate(max_iter=10000))
     count3 = collections.Counter(samples3)
-    print(samples3[:10], count3)
     assert set('abcxyz123456') == set(count3.keys())
 
 
 def test_critical_mux():
     # Check on Issue #80
     chars = 'abcde'
-    streamers = [pescador.Streamer(x * 5) for x in chars]
+    n_reps = 5
+    streamers = [pescador.Streamer(x * n_reps) for x in chars]
     mux = pescador.Mux(streamers, k=len(chars), rate=None,
-                       with_replacement=False, revive=True,
+                       with_replacement=False, revive=False,
                        prune_empty_streams=False, random_state=135)
-    samples = mux.iterate(max_iter=1000)
-    print(collections.Counter(samples))
+    samples = list(mux.iterate(max_iter=1000))
+    assert len(collections.Counter(samples)) == len(chars)
+    assert len(samples) == len(chars) * n_reps
 
 
 def _choice(vals):
@@ -251,7 +249,6 @@ def test_critical_mux_of_rate_limited_muxes():
     count = collections.Counter(samples)
     max_count, min_count = max(count.values()), min(count.values())
     assert (max_count - min_count) / max_count < 0.2
-    print(count)
     assert set('abcdefghijkl') == set(count.keys())
 
 
@@ -275,7 +272,7 @@ def test_sampled_mux_of_muxes():
     # And inspect the first mux
     samples1 = list(mux1(max_iter=6 * 10))
     count1 = collections.Counter(samples1)
-    print(count1)
+
     assert set(count1.keys()) == set('abcdef')
 
     # Build another set of streams
@@ -288,7 +285,6 @@ def test_sampled_mux_of_muxes():
     # And inspect the second mux
     samples2 = list(mux2(max_iter=6 * 10))
     count2 = collections.Counter(samples2)
-    print(count2)
     assert set(count2.keys()) == set('ghijkl')
 
     # Merge the muxes together.
@@ -296,7 +292,6 @@ def test_sampled_mux_of_muxes():
                         with_replacement=False, revive=False)
     samples3 = list(mux3.iterate(max_iter=10000))
     count3 = collections.Counter(samples3)
-    print(count3)
     assert set('abcdefghijkl') == set(count3.keys())
     max_count, min_count = max(count3.values()), min(count3.values())
     assert (max_count - min_count) / max_count < 0.2
@@ -315,31 +310,36 @@ def test_mux_inf_loop():
 
 
 def test_mux_stacked_uniform_convergence():
+    """This test is designed to check that bootstrapped streams of data
+    (Streamer subsampling, rate limiting) cascaded through multiple
+    multiplexors converge in expectation to a flat, uniform sample of the
+    stream directly.
+    """
     ab = pescador.Streamer(_choice, 'ab')
     cd = pescador.Streamer(_choice, 'cd')
     ef = pescador.Streamer(_choice, 'ef')
-    mux1 = pescador.Mux([ab, cd, ef], k=2, rate=2,
-                        with_replacement=False, revive=True)
+    mux1 = pescador.Mux([ab, cd, ef], k=2, rate=2, with_replacement=False,
+                        revive=True, random_state=1357)
 
     gh = pescador.Streamer(_choice, 'gh')
     ij = pescador.Streamer(_choice, 'ij')
     kl = pescador.Streamer(_choice, 'kl')
 
-    mux2 = pescador.Mux([gh, ij, kl], k=2, rate=2,
-                        with_replacement=False, revive=True)
+    mux2 = pescador.Mux([gh, ij, kl], k=2, rate=2, with_replacement=False,
+                        revive=True, random_state=2468)
 
     stacked_mux = pescador.Mux([mux1, mux2], k=2, rate=None,
-                               with_replacement=False, revive=True)
+                               with_replacement=False, revive=True,
+                               random_state=159)
 
-    flat_mux = pescador.Mux([ab, cd, ef, gh, ij, kl], k=6, rate=None,
-                            with_replacement=False, revive=False)
+    flat_mux = pescador.Streamer(_choice, 'abcdefghijkl')
 
     max_iter = 50000
     samples1 = list(stacked_mux.iterate(max_iter=max_iter))
     samples2 = list(flat_mux.iterate(max_iter=max_iter))
     count1 = collections.Counter(samples1)
     count2 = collections.Counter(samples2)
-    print(count1, count2)
+
     assert set('abcdefghijkl') == set(count1.keys()) == set(count2.keys())
     c1, c2 = [list(c.values()) for c in (count1, count2)]
     np.testing.assert_almost_equal(

From 1f8fea99c3e6fb11d9cf62633cfda99947fd980f Mon Sep 17 00:00:00 2001
From: Eric Humphrey <humphrey.eric@gmail.com>
Date: Fri, 7 Jul 2017 17:59:07 +0100
Subject: [PATCH 3/5] Updated convergence test.

---
 tests/test_mux.py | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/tests/test_mux.py b/tests/test_mux.py
index 01c6c36..3b960b9 100644
--- a/tests/test_mux.py
+++ b/tests/test_mux.py
@@ -330,17 +330,17 @@ def test_mux_stacked_uniform_convergence():
 
     stacked_mux = pescador.Mux([mux1, mux2], k=2, rate=None,
                                with_replacement=False, revive=True,
-                               random_state=159)
-
-    flat_mux = pescador.Streamer(_choice, 'abcdefghijkl')
+                               random_state=12345)
 
     max_iter = 50000
-    samples1 = list(stacked_mux.iterate(max_iter=max_iter))
-    samples2 = list(flat_mux.iterate(max_iter=max_iter))
-    count1 = collections.Counter(samples1)
-    count2 = collections.Counter(samples2)
+    chars = 'abcdefghijkl'
+    samples = list(stacked_mux.iterate(max_iter=max_iter))
+    counter = collections.Counter(samples)
+    assert set(chars) == set(counter.keys())
+
+    counts = np.array(counter.values())
+    exp_count = float(max_iter / len(chars))
+    max_error = np.max(np.abs(counts - exp_count) / exp_count)
 
-    assert set('abcdefghijkl') == set(count1.keys()) == set(count2.keys())
-    c1, c2 = [list(c.values()) for c in (count1, count2)]
-    np.testing.assert_almost_equal(
-        np.std(c1) / max_iter, np.std(c2) / max_iter, decimal=2)
+    # Confirm the max difference is under 5% -- for these seeds, it's 2.2
+    assert max_error < 0.05

From 696592eb242b61375d54d3be6961c8582c8efae8 Mon Sep 17 00:00:00 2001
From: Brian McFee <brian.mcfee@nyu.edu>
Date: Fri, 18 Aug 2017 16:08:14 -0400
Subject: [PATCH 4/5] fixed a type error in this one

---
 tests/test_mux.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/test_mux.py b/tests/test_mux.py
index 3b960b9..9326e8c 100644
--- a/tests/test_mux.py
+++ b/tests/test_mux.py
@@ -2,6 +2,7 @@
 
 import collections
 import numpy as np
+import scipy.stats
 import random
 
 import pescador
@@ -338,7 +339,7 @@ def test_mux_stacked_uniform_convergence():
     counter = collections.Counter(samples)
     assert set(chars) == set(counter.keys())
 
-    counts = np.array(counter.values())
+    counts = np.asarray(list(counter.values()))
     exp_count = float(max_iter / len(chars))
     max_error = np.max(np.abs(counts - exp_count) / exp_count)
 

From abb4bbb13716dbd1f318cc5161f870509d23783a Mon Sep 17 00:00:00 2001
From: Brian McFee <brian.mcfee@nyu.edu>
Date: Mon, 21 Aug 2017 11:24:28 -0400
Subject: [PATCH 5/5] fixed seed for stacked mux test, rewrote to use chisquare

---
 tests/test_mux.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/tests/test_mux.py b/tests/test_mux.py
index 9326e8c..bd14d59 100644
--- a/tests/test_mux.py
+++ b/tests/test_mux.py
@@ -3,7 +3,6 @@
 import collections
 import numpy as np
 import scipy.stats
-import random
 
 import pescador
 import pescador.mux
@@ -217,9 +216,11 @@ def test_critical_mux():
     assert len(samples) == len(chars) * n_reps
 
 
-def _choice(vals):
+def _choice(vals, seed=11111):
+    rng = np.random.RandomState(seed=seed)
+    n = len(vals)
     while True:
-        yield random.choice(vals)
+        yield vals[rng.randint(0, n)]
 
 
 def _cycle(values):
@@ -333,15 +334,14 @@ def test_mux_stacked_uniform_convergence():
                                with_replacement=False, revive=True,
                                random_state=12345)
 
-    max_iter = 50000
+    max_iter = 1000
     chars = 'abcdefghijkl'
     samples = list(stacked_mux.iterate(max_iter=max_iter))
     counter = collections.Counter(samples)
     assert set(chars) == set(counter.keys())
 
     counts = np.asarray(list(counter.values()))
-    exp_count = float(max_iter / len(chars))
-    max_error = np.max(np.abs(counts - exp_count) / exp_count)
 
-    # Confirm the max difference is under 5% -- for these seeds, it's 2.2
-    assert max_error < 0.05
+    # Check that the pvalue for the chi^2 test is at least 0.95
+    test = scipy.stats.chisquare(counts)
+    assert test.pvalue >= 0.95