From 5805dc776f4a6ff3da63fe15efb1502ae8d00ac8 Mon Sep 17 00:00:00 2001 From: Charles Blake Date: Fri, 27 Sep 2024 07:18:32 -0400 Subject: [PATCH] For all 4 adix/oats test programs: - Change behavior on token overflow to truncate not skip and always print the too long length and a truncated version of it. - Make programs more lexically similar so that side-by-side's like `diff tests/ucl.nim util/lfreq.nim` to add counts `diff tests/ucl.nim tests/wu.nim` to add words `diff tests/wu.nim tests/wfr.nim` to add both words & counts are all less noisy. (Probably still not perfect, but closer.) --- tests/ucl.nim | 28 +++++++++++++--------------- tests/wfr.nim | 7 ++++--- tests/wu.nim | 7 ++++--- util/lfreq.nim | 10 +++++----- 4 files changed, 26 insertions(+), 26 deletions(-) diff --git a/tests/ucl.nim b/tests/ucl.nim index d209128..90f0771 100644 --- a/tests/ucl.nim +++ b/tests/ucl.nim @@ -12,42 +12,40 @@ type dat: seq[Count] nUsed: int -var a = " "; oatKStack a, Counts, Count, off,uint32, MSlice, MSlice -#proc key(c: var Counts, i: int, q: MSlice) = c.dat[i]=c.keyR(q) wrong&unneeded +var s = " "; oatKStack s, Counts, Count, off,uint32, MSlice, MSlice proc key(c: Counts, i: int): MSlice = c.dat[i].key proc used(c: Counts, i: int): bool = c.dat[i].off!=0 - when defined hashCache: # def auto-triggers use proc hash(ms: MSlice): Hash = mslice.hash(ms).uint32.Hash proc hash(c: var Counts, i: int, hc: Hash) {.used.} = c.dat[i].hc = hc.uint32 proc hash(c: Counts, i: int): Hash = c.dat[i].hc.Hash - oatCounted c,Counts, c.nUsed; oatSeq Counts, dat # make counted & resizable when Counts is ROat[MSlice, MSlice]: {.warning: "Counts is a ROat"} -proc incFailed(h: var Counts, r: MSlice): bool = - if r.len + 1 > 1 shl bLen: # Careful to not overflow - erru "skipping too long(", $r.len, ") line: ",$r,"\n" - return # Cannot go on LOCALLY - h.upSert(r, i): discard # Found key @i: nothing to do +proc incFailed(h: var Counts, ms: MSlice): bool = + var ms = ms + if ms.len > (1 shl bLen) - 1: # 
Careful to not overflow XXX rate limit msgs + erru "truncating too long (", $ms.len, ") line: ", ($ms)[0..<256], "...\n" + ms.len = (1 shl bLen) - 1 # Truncation makes count potentially off + h.upSert(ms, i): discard # Found key @i: nothing to do do: # Novel key->i: - h.dat[i].off = a.add(r, (1 shl bOff) - 1): - erru "unique word data overflow at:",$r,"\n" #XXX rate limit msgs + h.dat[i].off = s.add(ms, (1 shl bOff) - 1): + erru "unique word data overflow at:",$ms,"\n" #XXX rate limit msgs return true # Cannot go on GLOBALLY - h.dat[i].len = r.len.uint32 # Init + h.dat[i].len = ms.len.uint32 # Init proc ucl(size=9999, dSize=81920, tm=false) = ## Count unique & total lines on `stdin`. <256B long; <16 MiB unique data. let t0 = if tm: epochTime() else: 0.0 - var h: Counts; h.setCap size # Pre-size table & data - a.setLen dSize; a.setLen 1 + var h: Counts; h.setCap size # pre-size table & data + s.setLen dSize; s.setLen 1 var nTot = 0 block IO: for (line, nLine) in stdin.getDelims: let ms = MSlice(mem: line, len: nLine - 1) inc nTot # Always bump `nTotal` if h.incFailed(ms): break IO - echo h.len," unique ",nTot," total ",a.len," B" + echo h.len," unique ",nTot," total ",s.len," B" if tm: stderr.write epochTime() - t0, "\n" when isMainModule: dispatch ucl, help={ diff --git a/tests/wfr.nim b/tests/wfr.nim index 54dd268..05ce0c7 100644 --- a/tests/wfr.nim +++ b/tests/wfr.nim @@ -27,9 +27,10 @@ oatCounted c,Counts, c.nUsed; oatSeq Counts, dat # make counted & resizable when Counts is VROat[MSlice, MSlice, uint32]: {.warning: "Counts is a VROat"} proc incFailed(h: var Counts, ms: MSlice): bool = - if ms.len > (1 shl bLen) - 1: # Careful to not overflow - erru "skipping too long word: ",$ms,"\n" - return # Cannot go on LOCALLY + var ms = ms + if ms.len > (1 shl bLen) - 1: # Careful to not overflow XXX rate limit msgs + erru "truncating too long (", $ms.len, ") word: ", ($ms)[0..<32], "...\n" + ms.len = (1 shl bLen) - 1 # Truncation makes count potentially off h.upSert(ms, 
i): # Found key @i: if h.dat[i].cnt == (1 shl bCnt) - 1: erru "counter overflow for: ",$ms,"\n" # no update XXX rate limit diff --git a/tests/wu.nim b/tests/wu.nim index b7b1a28..a476c29 100644 --- a/tests/wu.nim +++ b/tests/wu.nim @@ -23,9 +23,10 @@ oatCounted c,Counts, c.nUsed; oatSeq Counts, dat # make counted & resizable when Counts is ROat[MSlice, MSlice]: {.warning: "Counts is a ROat"} proc incFailed(h: var Counts, ms: MSlice): bool = - if ms.len > (1 shl bLen) - 1: # Careful to not overflow - erru "skipping too long word: ",$ms,"\n" - return # Cannot go on LOCALLY + var ms = ms + if ms.len > (1 shl bLen) - 1: # Careful to not overflow XXX rate limit msgs + erru "truncating too long (", $ms.len, ") word: ", ($ms)[0..<32], "...\n" + ms.len = (1 shl bLen) - 1 h.upSert(ms, i): discard # Found key @i: do: # Novel key->i: h.dat[i].off = s.add(ms, (1 shl bOff) - 1): diff --git a/util/lfreq.nim b/util/lfreq.nim index e01126d..8e4c4d1 100644 --- a/util/lfreq.nim +++ b/util/lfreq.nim @@ -27,16 +27,17 @@ oatCounted c,Counts, c.nUsed; oatSeq Counts, dat # make counted & resizable when Counts is VROat[MSlice, MSlice, uint32]: {.warning: "Counts is a VROat"} proc incFailed(h: var Counts, ms: MSlice): bool = - if ms.len > (1 shl bLen) - 1: # Careful to not overflow - erru "skipping too long line: ", ($ms)[0..<128], "...\n" - return false # Cannot go on LOCALLY + var ms = ms + if ms.len > (1 shl bLen) - 1: # Careful to not overflow XXX rate limit msgs + erru "truncating too long (", $ms.len, ") line: ", ($ms)[0..<128], "...\n" + ms.len = (1 shl bLen) - 1 # Truncation makes count potentially off h.upSert(ms, i): # Found key @i: if h.dat[i].cnt == (1 shl bCnt) - 1: - erru "counter overflow for: ",$ms,"\n" # no update XXX rate limit + erru "counter overflow for: ",$ms,"\n" # no update XXX rate limit msgs else: h.dat[i].cnt.inc # bump do: # Novel key->i: h.dat[i].off = s.add(ms, (1 shl bOff) - 1): - erru "unique line data overflow at:",$ms,"\n" #XXX rate limit + erru "unique line data overflow at:",$ms,"\n" #XXX rate limit msgs return true # 
Cannot go on GLOBALLY h.dat[i].len = ms.len.uint32# Init h.dat[i].cnt = 1u32