From b39f8c133f4eb9183d022991bbf0e24ace423198 Mon Sep 17 00:00:00 2001 From: Charles Blake Date: Tue, 30 Jul 2024 04:59:12 -0400 Subject: [PATCH] Use new `adix/topk` module to centralize/simplify, generalize (to asc, desc, cheap), and speed-up such work in client code. For name hygiene, libraries `ditab`, `lptabz`, `oats` re-export only `topk.TopKOrder`. (But really things like `topByVal` that work on any Key-Value `pairs` should be lifted into their own adix/algos module anyway.) As part of the update, consistify `tests/wf|wfr`, `util/lfreq` Re: `-n` use & add a "..." for the overlong lines in `lfreq`. --- adix/ditab.nim | 19 +++++-------------- adix/lptabz.nim | 23 +++++++---------------- adix/oats.nim | 20 +++++--------------- tests/wf.nim | 26 +++++++------------------- tests/wfr.nim | 7 ++++--- util/lfreq.nim | 17 +++++++---------- 6 files changed, 35 insertions(+), 77 deletions(-) diff --git a/adix/ditab.nim b/adix/ditab.nim index 7c242a8..4fef7ea 100644 --- a/adix/ditab.nim +++ b/adix/ditab.nim @@ -21,7 +21,7 @@ ## under another term like "direct indexing". `K` below must have an available ## conversion to ``int``. Duplicate keys cannot be allowed for this one. -import althash, sequint, heapqueue +import althash, sequint, topk; export topk.TopKOrder when not declared(assert): import std/assertions type DITab*[K,V] = object ## Alphabet size determines `K`; `V` may be `void` @@ -425,19 +425,10 @@ proc inc*[K,V: SomeInteger](t: var DITab[K,V], key: K, proc merge*[K,V: SomeInteger](c: var DITab[K,V], b: DITab[K,V]) = for key, val in b: c.inc(key, val) -iterator topByVal*[K,V](c: DITab[K,V], n=10, min=V.low): (K, V) = - var q = initHeapQueue[(V, K)]() - for key, val in c: - if val >= min: - let e = (val, key) - if q.len < n: q.push(e) - elif e > q[0]: discard q.replace(e) - var y: (K, V) - while q.len > 0: # q now has top n entries - let r = q.pop - y[0] = r[1] - y[1] = r[0] - yield y # yield in ascending order +iterator topByVal*[K,V](c: DITab[K,V], n=10, min=V.low, order=Cheap): (K, V) = + var t = initTopK[(V,K)](n) + for k, v in ditab.pairs(c): (if v >= min: t.push (v, k)) + for e in topk.maybeOrdered(t, order): yield (e[1], e[0]) proc initDISet*[K](initialSize=0, numer=diNumer, denom=diDenom, minFree=diMinFree, growPow2=diGrowPow2, rehash=diRehash, diff --git a/adix/lptabz.nim b/adix/lptabz.nim index b9f2d21..6c4c87e 100644 --- a/adix/lptabz.nim +++ b/adix/lptabz.nim @@ -33,8 +33,8 @@ ## `seq[(K,V)]`. 6..8 bits avoids most "double cache misses" for miss ## lookups/inserts. `z=0` works if space matters more than time. -import althash, memutil, bitop, heapqueue, sequint, strutils, memfiles -export Hash, sequint +import althash, memutil, bitop, topk, sequint, std/[strutils, memfiles] +export Hash, sequint, topk.TopKOrder when not declared(assert): import std/[assertions, objectdollar] when declared(File): template stdOpen(x: varargs[untyped]): untyped = system.open(x) @@ -1082,21 +1082,12 @@ proc merge*[K,V,Z;z:static int](c: var LPTabz[K,V,Z,z], b: LPTabz[K,V,Z,z]) = for key, val in b: c.inc(key, val) iterator topByVal*[K,V,Z;z:static int](c: LPTabz[K,V,Z,z], n=10, - min=V.low): (K, V) = + min=V.low, order=Cheap): (K, V) = ## Iterate from smallest to largest over biggest `n` items by value in `c`. - ## If `n==0` this is effectively heap sort of `c` by value `V`. - var q = initHeapQueue[(V, K)]() - for key, val in c: - if val >= min: - let e = (val, key) - if n == 0 or q.len < n: q.push(e) - elif e > q[0]: discard q.replace(e) - var y: (K, V) - while q.len > 0: # q now has top n entries - let r = q.pop - y[0] = r[1] - y[1] = r[0] - yield y # yield in ascending order + ## `order` can be `Cheap`, `Ascending`, or `Descending`. + var t = initTopK[(V,K)](n) + for k, v in lptabz.pairs(c): (if v >= min: t.push (v, k)) + for e in topk.maybeOrdered(t, order): yield (e[1], e[0]) iterator mostCommon*[K](xs: openArray[K], n=10): (K, int) = ## Iterate over (`n` most common values in `xs`, their counts) tuples. diff --git a/adix/oats.nim b/adix/oats.nim index 7995695..be4ab65 100644 --- a/adix/oats.nim +++ b/adix/oats.nim @@ -1,4 +1,4 @@ -import std/[hashes, heapqueue], adix/bitop +import std/hashes, adix/[bitop, topk]; export topk.TopKOrder template pua*(T: typedesc): untyped = ptr UncheckedArray[T] # Since want no `setCap | pairs` to exist for fixed size | set-like `t`, what is # `concept` is driven by the external interface, but performance tweaks { like @@ -159,22 +159,12 @@ iterator values*[K,Q,V](t: VOat[K,Q,V]): V = iterator pairs*[K,Q,V](t: VOat[K,Q,V]): (K, V) = for i in 0 ..< t.cap: (if t.used i: yield (t.key i, t.val i)) -iterator topByVal*[K,Q,V](s: VOat[K,Q,V], n=10, min=V.low): (K, V) = +iterator topByVal*[K,Q,V](s: VOat[K,Q,V], n=10, min=V.low, order=topk.Cheap): (K, V)= ## Iterate from smallest to largest over biggest `n` items by value in `s`. ## If `n==0` this is effectively heapSort of `s` by value `V`. - proc `<`(a, b: (V,K)): bool = a[0] < b[0] # ignore K => only partial order - var q = initHeapQueue[(V,K)]() - for k, v in oats.pairs(s): - if v >= min: - let e = (v, k) - if n == 0 or q.len < n: q.push e - elif e > q[0]: discard q.replace(e) - var y: (K,V) - while q.len > 0: # q now has top n entries - let r = q.pop - y[0] = r[1] - y[1] = r[0] - yield y # yield in ascending order + var t = initTopK[(V,K)](n) + for k, v in oats.pairs(s): (if v >= min: t.push (v, k)) + for e in topk.maybeOrdered(t, order): yield (e[1], e[0]) template oatKStack*(s, Self, Cell, off, offT, K, Q) = ## Def routines for back-to-back/stacked variable length, unpadded key data. diff --git a/tests/wf.nim b/tests/wf.nim index d924d76..84411c8 100644 --- a/tests/wf.nim +++ b/tests/wf.nim @@ -1,7 +1,6 @@ when not declared(addFloat): import std/[formatfloat, typedthreads] when not declared(Thread): import std/threads -import std/[heapqueue, hashes, osproc, times], - adix/lptabz, cligen/[mfile, mslice, osUt], cligen +import std/[hashes,osproc,times], adix/lptabz, cligen/[mfile,mslice,osUt],cligen type Word = distinct uint32 Count = uint32 @@ -37,7 +36,7 @@ proc hash(w: Word): Hash {.inline.} = proc `==`(a, b: Word): bool {.inline.} = a.len == b.len and cmemcmp(a.mem, b.mem, a.len) == 0 -proc `<`(a, b: Word): bool {.inline.} = # for heapqueue +proc `<`(a, b: Word): bool {.inline.} = # for topk.push let c = cmemcmp(a.mem, b.mem, min(a.len, b.len)) if c == 0: a.len < b.len else: c < 0 @@ -94,18 +93,6 @@ proc count(p: int, path: string) = # split path into `p` ~equal segments else: work (parts[0].addr, hs[0].addr, nTs[0].addr) # ST-mode: No spawn else: stderr.write "wf: \"", path, "\" missing/irregular\n" -iterator top(h: Histo, n=10): (Word, Count) = - var q = initHeapQueue[(Count, Word)]() - for key, val in h: - let elem = (val, key) # maintain a heap.. - if q.len < n: q.push(elem) # ..of the biggest n items - elif elem > q[0]: discard q.replace(elem) - var y: (Word, Count) # yielded tuple - while q.len > 0: # q now has top n entries - let r = q.pop - y[0] = r[1]; y[1] = r[0] - yield y # yield in ASCENDING order - proc wf(path:seq[string], n=10, c=false, N=false, jobs=1, sz=9999, tm=false) = ## Parallel word frequency tool for one file < 128 MiB and words < 32 chars. ## Aggregate multiple via, e.g., `cat \*\*/\*.txt > /dev/shm/inp`. Similar @@ -122,9 +109,10 @@ proc wf(path:seq[string], n=10, c=false, N=false, jobs=1, sz=9999, tm=false) = for wd, cnt in hs[i]: hs[0].mgetOrPut(wd, 0) += cnt if c: echo hs[0].len," unique ",nTs[0]," total" template o = echo (if N: $(c.float/nTs[0].float) else: $c)," ",w - if n == 0: (for w, c in hs[0].pairs: o()) # unsorted whole - elif n > 0: (for w, c in hs[0].top(n): o()) # sorted top N - if tm: stderr.write epochTime() - t0, " sec\n" # n < 0 = only `c`/tm + if n == 0: (for w, c in hs[0].pairs: o()) # unsorted whole + elif n > 0 : (for w, c in hs[0].topByVal(n): o()) # unsorted top N + elif n < -1: (for w, c in hs[0].topByVal(n, order=Descending): o()) # sorted + if tm: stderr.write epochTime() - t0, " sec\n" # n == -1: only `c`/tm -dispatch(wf, help={"n": "print top n; 0=>all", "c": "count only", "N": "norm", +dispatch(wf, help={"n": "do top n; 0all,<0sort", "c": "count only", "N": "norm", "tm": "time", "jobs": "num threads; 0=>auto", "sz": "init size"}) diff --git a/tests/wfr.nim b/tests/wfr.nim index 92c8f21..ce22f9b 100644 --- a/tests/wfr.nim +++ b/tests/wfr.nim @@ -58,12 +58,13 @@ proc wfr(n=10, count=false,Norm=false, size=9999,dsize=81920, tm=false, Dlm="")= if count: echo h.len," unique ",nTot," total ",s.len," B" template output = if Norm: outu c.float/nTot.float," ",k,"\n" else: outu c," ",k,"\n" - if n == 0: (for (k, c) in pairs(h): output()) - elif n > 0: (for (k, c) in h.topByVal(n): output()) + if n == 0: (for (k, c) in pairs(h): output()) + elif n > 0 : (for (k, c) in h.topByVal(n): output()) + elif n < -1: (for (k, c) in h.topByVal(n, order=Descending): output()) if tm: stderr.write epochTime() - t0, "\n" when isMainModule: dispatch wfr, help={ - "n" : "only emit most frequent `n` lines(!=0=>sorted)", + "n" : "emit `n`-most common lines(0:all; <0 sorted)", "count": "only emit counts: unique & grand total", "Norm" : "normalize frequencies by dividing by grand tot", "size" : "pre-size hash table for size unique entries", diff --git a/util/lfreq.nim b/util/lfreq.nim index 5da516c..e01126d 100644 --- a/util/lfreq.nim +++ b/util/lfreq.nim @@ -1,6 +1,5 @@ when not declared(stdin): import std/[syncio, formatfloat] -import std/[hashes, times, sugar, algorithm, strutils], - cligen, cligen/[mslice, strUt, osUt], adix/oats +import std/[hashes,times,strutils], adix/oats, cligen/[mslice,strUt,osUt],cligen const bLen {.intdefine.} = 16 # <16K long; RT params better but more work const bOff {.intdefine.} = 32 # <4G UNIQUE line data @@ -29,7 +28,7 @@ when Counts is VROat[MSlice, MSlice, uint32]: {.warning: "Counts is a VROat"} proc incFailed(h: var Counts, ms: MSlice): bool = if ms.len > (1 shl bLen) - 1: # Careful to not overflow - erru "skipping too long line: ", ($ms)[0..<128], "\n" + erru "skipping too long line: ", ($ms)[0..<128], "...\n" return false # Cannot go on LOCALLY h.upSert(ms, i): # Found key @i: if h.dat[i].cnt == (1 shl bCnt) - 1: @@ -66,15 +65,13 @@ proc lfreq(n=10, count=false, size=9999, dSize=81920, recTerm='\n', elif format[id.a] == 'f': fs.setLen 0; fs.fcvt c.float*nInv, 9; outu fs else: outu MSlice(mem: format[call.a].addr, len: call.len) outu RecTerm - if n == 0: (for (k, c) in pairs(h): output()) - elif n > 0: (for (k, c) in h.topByVal(n): output()) - elif n < -1: # -1 is same as +1; Hijack value to mean no output() - var x = collect(for (k, c) in h.topByVal(-n): (k, c)) - x.reverse; (for (k, c) in x: output()) - if tm: stderr.write epochTime() - t0, "\n" + if n == 0: (for (k, c) in pairs(h): output()) + elif n > 0 : (for (k, c) in h.topByVal(n): output()) + elif n < -1: (for (k, c) in h.topByVal(-n, order=Descending): output()) + if tm: stderr.write epochTime() - t0, "\n" # -n-1 for only time output when isMainModule: dispatch lfreq, help={ - "n" : "only emit most frequent `n` lines(!=0=>sorted)", + "n" : "emit `n`-most common lines(0:all; <0 sorted)", "count": "only emit counts: unique & grand total", "size" : "pre-size hash table for size unique entries", "dSize": "pre-size str data area to this many bytes",