From bca84b5d595419af487bc68e09d67f40c092ed2e Mon Sep 17 00:00:00 2001 From: Eric Humphrey Date: Thu, 6 Jul 2017 10:25:54 +0100 Subject: [PATCH 1/5] Added a test that stacked bootstrapped streams are approximately uniform at large N --- tests/test_mux.py | 52 +++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 43 insertions(+), 9 deletions(-) diff --git a/tests/test_mux.py b/tests/test_mux.py index 7f2ae0e..d0ad57e 100644 --- a/tests/test_mux.py +++ b/tests/test_mux.py @@ -1,7 +1,6 @@ import pytest import collections -import itertools import numpy as np import random @@ -219,11 +218,19 @@ def test_critical_mux(): print(collections.Counter(samples)) +def _choice(vals): + while True: + yield random.choice(vals) + + +def _cycle(values): + while True: + for v in values: + yield v + + def test_critical_mux_of_rate_limited_muxes(): # Check on Issue #79 - def _choice(vals): - while True: - yield random.choice(vals) ab = pescador.Streamer(_choice, 'ab') cd = pescador.Streamer(_choice, 'cd') @@ -258,11 +265,6 @@ def test_restart_mux(): def test_sampled_mux_of_muxes(): - def _cycle(values): - while True: - for v in values: - yield v - # Build some sample streams ab = pescador.Streamer(_cycle, 'ab') cd = pescador.Streamer(_cycle, 'cd') @@ -310,3 +312,35 @@ def test_mux_inf_loop(): with_replacement=False, random_state=1234) assert len(list(mux(max_iter=100))) == 0 + + +def test_mux_stacked_uniform_convergence(): + ab = pescador.Streamer(_choice, 'ab') + cd = pescador.Streamer(_choice, 'cd') + ef = pescador.Streamer(_choice, 'ef') + mux1 = pescador.Mux([ab, cd, ef], k=2, rate=2, + with_replacement=False, revive=True) + + gh = pescador.Streamer(_choice, 'gh') + ij = pescador.Streamer(_choice, 'ij') + kl = pescador.Streamer(_choice, 'kl') + + mux2 = pescador.Mux([gh, ij, kl], k=2, rate=2, + with_replacement=False, revive=True) + + stacked_mux = pescador.Mux([mux1, mux2], k=2, rate=None, + with_replacement=False, revive=True) + + flat_mux = pescador.Mux([ab, cd, ef, 
gh, ij, kl], k=6, rate=None, + with_replacement=False, revive=False) + + max_iter = 50000 + samples1 = list(stacked_mux.iterate(max_iter=max_iter)) + samples2 = list(flat_mux.iterate(max_iter=max_iter)) + count1 = collections.Counter(samples1) + count2 = collections.Counter(samples2) + print(count1, count2) + assert set('abcdefghijkl') == set(count1.keys()) == set(count2.keys()) + c1, c2 = [list(c.values()) for c in (count1, count2)] + np.testing.assert_almost_equal( + np.std(c1) / max_iter, np.std(c2) / max_iter, decimal=2) From 7cbddcde60c5a6d3cb6c1fcfc9aa9c76976bed05 Mon Sep 17 00:00:00 2001 From: Eric Humphrey Date: Fri, 7 Jul 2017 14:50:59 +0100 Subject: [PATCH 2/5] Fixed comments from PR --- tests/test_mux.py | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/tests/test_mux.py b/tests/test_mux.py index d0ad57e..01c6c36 100644 --- a/tests/test_mux.py +++ b/tests/test_mux.py @@ -161,7 +161,6 @@ def test_mux_of_muxes_itered(): random_state=135) samples1 = mux1.iterate(max_iter=1000) count1 = collections.Counter(samples1) - print(count1) assert set('abcxyz') == set(count1.keys()) n123 = pescador.Streamer('123') @@ -171,7 +170,6 @@ def test_mux_of_muxes_itered(): random_state=246) samples2 = mux2.iterate(max_iter=1000) count2 = collections.Counter(samples2) - print(count2) assert set('123456') == set(count2.keys()) # Note that (random_state=987, k=2) fails. 
@@ -180,7 +178,6 @@ def test_mux_of_muxes_itered(): random_state=987) samples3 = mux3.iterate(max_iter=1000) count3 = collections.Counter(samples3) - print(count3) assert set('abcxyz123456') == set(count3.keys()) @@ -203,19 +200,20 @@ def test_mux_of_muxes_single(): prune_empty_streams=False) samples3 = list(mux3.iterate(max_iter=10000)) count3 = collections.Counter(samples3) - print(samples3[:10], count3) assert set('abcxyz123456') == set(count3.keys()) def test_critical_mux(): # Check on Issue #80 chars = 'abcde' - streamers = [pescador.Streamer(x * 5) for x in chars] + n_reps = 5 + streamers = [pescador.Streamer(x * n_reps) for x in chars] mux = pescador.Mux(streamers, k=len(chars), rate=None, - with_replacement=False, revive=True, + with_replacement=False, revive=False, prune_empty_streams=False, random_state=135) - samples = mux.iterate(max_iter=1000) - print(collections.Counter(samples)) + samples = list(mux.iterate(max_iter=1000)) + assert len(collections.Counter(samples)) == len(chars) + assert len(samples) == len(chars) * n_reps def _choice(vals): @@ -251,7 +249,6 @@ def test_critical_mux_of_rate_limited_muxes(): count = collections.Counter(samples) max_count, min_count = max(count.values()), min(count.values()) assert (max_count - min_count) / max_count < 0.2 - print(count) assert set('abcdefghijkl') == set(count.keys()) @@ -275,7 +272,7 @@ def test_sampled_mux_of_muxes(): # And inspect the first mux samples1 = list(mux1(max_iter=6 * 10)) count1 = collections.Counter(samples1) - print(count1) + assert set(count1.keys()) == set('abcdef') # Build another set of streams @@ -288,7 +285,6 @@ def test_sampled_mux_of_muxes(): # And inspect the second mux samples2 = list(mux2(max_iter=6 * 10)) count2 = collections.Counter(samples2) - print(count2) assert set(count2.keys()) == set('ghijkl') # Merge the muxes together. 
@@ -296,7 +292,6 @@ def test_sampled_mux_of_muxes(): with_replacement=False, revive=False) samples3 = list(mux3.iterate(max_iter=10000)) count3 = collections.Counter(samples3) - print(count3) assert set('abcdefghijkl') == set(count3.keys()) max_count, min_count = max(count3.values()), min(count3.values()) assert (max_count - min_count) / max_count < 0.2 @@ -315,31 +310,36 @@ def test_mux_inf_loop(): def test_mux_stacked_uniform_convergence(): + """This test is designed to check that bootstrapped streams of data + (Streamer subsampling, rate limiting) cascaded through multiple + multiplexors converge in expectation to a flat, uniform sample of the + stream directly. + """ ab = pescador.Streamer(_choice, 'ab') cd = pescador.Streamer(_choice, 'cd') ef = pescador.Streamer(_choice, 'ef') - mux1 = pescador.Mux([ab, cd, ef], k=2, rate=2, - with_replacement=False, revive=True) + mux1 = pescador.Mux([ab, cd, ef], k=2, rate=2, with_replacement=False, + revive=True, random_state=1357) gh = pescador.Streamer(_choice, 'gh') ij = pescador.Streamer(_choice, 'ij') kl = pescador.Streamer(_choice, 'kl') - mux2 = pescador.Mux([gh, ij, kl], k=2, rate=2, - with_replacement=False, revive=True) + mux2 = pescador.Mux([gh, ij, kl], k=2, rate=2, with_replacement=False, + revive=True, random_state=2468) stacked_mux = pescador.Mux([mux1, mux2], k=2, rate=None, - with_replacement=False, revive=True) + with_replacement=False, revive=True, + random_state=159) - flat_mux = pescador.Mux([ab, cd, ef, gh, ij, kl], k=6, rate=None, - with_replacement=False, revive=False) + flat_mux = pescador.Streamer(_choice, 'abcdefghijkl') max_iter = 50000 samples1 = list(stacked_mux.iterate(max_iter=max_iter)) samples2 = list(flat_mux.iterate(max_iter=max_iter)) count1 = collections.Counter(samples1) count2 = collections.Counter(samples2) - print(count1, count2) + assert set('abcdefghijkl') == set(count1.keys()) == set(count2.keys()) c1, c2 = [list(c.values()) for c in (count1, count2)] 
np.testing.assert_almost_equal( From 1f8fea99c3e6fb11d9cf62633cfda99947fd980f Mon Sep 17 00:00:00 2001 From: Eric Humphrey Date: Fri, 7 Jul 2017 17:59:07 +0100 Subject: [PATCH 3/5] Updated convergence test. --- tests/test_mux.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/tests/test_mux.py b/tests/test_mux.py index 01c6c36..3b960b9 100644 --- a/tests/test_mux.py +++ b/tests/test_mux.py @@ -330,17 +330,17 @@ def test_mux_stacked_uniform_convergence(): stacked_mux = pescador.Mux([mux1, mux2], k=2, rate=None, with_replacement=False, revive=True, - random_state=159) - - flat_mux = pescador.Streamer(_choice, 'abcdefghijkl') + random_state=12345) max_iter = 50000 - samples1 = list(stacked_mux.iterate(max_iter=max_iter)) - samples2 = list(flat_mux.iterate(max_iter=max_iter)) - count1 = collections.Counter(samples1) - count2 = collections.Counter(samples2) + chars = 'abcdefghijkl' + samples = list(stacked_mux.iterate(max_iter=max_iter)) + counter = collections.Counter(samples) + assert set(chars) == set(counter.keys()) + + counts = np.array(counter.values()) + exp_count = float(max_iter / len(chars)) + max_error = np.max(np.abs(counts - exp_count) / exp_count) - assert set('abcdefghijkl') == set(count1.keys()) == set(count2.keys()) - c1, c2 = [list(c.values()) for c in (count1, count2)] - np.testing.assert_almost_equal( - np.std(c1) / max_iter, np.std(c2) / max_iter, decimal=2) + # Confirm the max difference is under 5% -- for these seeds, it's 2.2 + assert max_error < 0.05 From 696592eb242b61375d54d3be6961c8582c8efae8 Mon Sep 17 00:00:00 2001 From: Brian McFee Date: Fri, 18 Aug 2017 16:08:14 -0400 Subject: [PATCH 4/5] fixed a type error in this one --- tests/test_mux.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_mux.py b/tests/test_mux.py index 3b960b9..9326e8c 100644 --- a/tests/test_mux.py +++ b/tests/test_mux.py @@ -2,6 +2,7 @@ import collections import numpy as np +import scipy.stats 
import random import pescador @@ -338,7 +339,7 @@ def test_mux_stacked_uniform_convergence(): counter = collections.Counter(samples) assert set(chars) == set(counter.keys()) - counts = np.array(counter.values()) + counts = np.asarray(list(counter.values())) exp_count = float(max_iter / len(chars)) max_error = np.max(np.abs(counts - exp_count) / exp_count) From abb4bbb13716dbd1f318cc5161f870509d23783a Mon Sep 17 00:00:00 2001 From: Brian McFee Date: Mon, 21 Aug 2017 11:24:28 -0400 Subject: [PATCH 5/5] fixed seed for stacked mux test, rewrote to use chisquare --- tests/test_mux.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/test_mux.py b/tests/test_mux.py index 9326e8c..bd14d59 100644 --- a/tests/test_mux.py +++ b/tests/test_mux.py @@ -3,7 +3,6 @@ import collections import numpy as np import scipy.stats -import random import pescador import pescador.mux @@ -217,9 +216,11 @@ def test_critical_mux(): assert len(samples) == len(chars) * n_reps -def _choice(vals): +def _choice(vals, seed=11111): + rng = np.random.RandomState(seed=seed) + n = len(vals) while True: - yield random.choice(vals) + yield vals[rng.randint(0, n)] def _cycle(values): @@ -333,15 +334,14 @@ def test_mux_stacked_uniform_convergence(): with_replacement=False, revive=True, random_state=12345) - max_iter = 50000 + max_iter = 1000 chars = 'abcdefghijkl' samples = list(stacked_mux.iterate(max_iter=max_iter)) counter = collections.Counter(samples) assert set(chars) == set(counter.keys()) counts = np.asarray(list(counter.values())) - exp_count = float(max_iter / len(chars)) - max_error = np.max(np.abs(counts - exp_count) / exp_count) - # Confirm the max difference is under 5% -- for these seeds, it's 2.2 - assert max_error < 0.05 + # Check that the pvalue for the chi^2 test is at least 0.95 + test = scipy.stats.chisquare(counts) + assert test.pvalue >= 0.95