Use new adix/topk module to centralize/simplify, generalize (to asc, desc, cheap), and speed-up such work in client code. For name hygiene, libraries `ditab`, `lptabz`, `oats` re-export only `topk.TopKOrder`. (But really things like `topByVal` that work on any Key-Value `pairs` should be lifted into their own adix/algos module anyway.) As part of the update, make `tests/wf|wfr` and `util/lfreq` consistent Re: `-n` use & add a "..." for the overlong lines in `lfreq`.
c-blake · Jul 30, 2024 · b39f8c1 · b39f8c1
1 parent a4a0ce8
commit b39f8c1
Show file tree

Hide file tree

Showing 6 changed files with 35 additions and 77 deletions.
diff --git a/adix/ditab.nim b/adix/ditab.nim
@@ -21,7 +21,7 @@
 ## under another term like "direct indexing".  `K` below must have an available
 ## conversion to ``int``.  Duplicate keys cannot be allowed for this one.
 
-import althash, sequint, heapqueue
+import althash, sequint, topk; export topk.TopKOrder
 when not declared(assert): import std/assertions
 type
   DITab*[K,V] = object  ## Alphabet size determines `K`; `V` may be `void`
@@ -425,19 +425,10 @@ proc inc*[K,V: SomeInteger](t: var DITab[K,V], key: K,
 proc merge*[K,V: SomeInteger](c: var DITab[K,V], b: DITab[K,V]) =
   for key, val in b: c.inc(key, val)
 
-iterator topByVal*[K,V](c: DITab[K,V], n=10, min=V.low): (K, V) =
-  var q = initHeapQueue[(V, K)]()
-  for key, val in c:
-    if val >= min:
-      let e = (val, key)
-      if q.len < n: q.push(e)
-      elif e > q[0]: discard q.replace(e)
-  var y: (K, V)
-  while q.len > 0:        # q now has top n entries
-    let r = q.pop
-    y[0] = r[1]
-    y[1] = r[0]
-    yield y               # yield in ascending order
+iterator topByVal*[K,V](c: DITab[K,V], n=10, min=V.low, order=Cheap): (K, V) =
+  var t = initTopK[(V,K)](n)
+  for k, v in ditab.pairs(c): (if v >= min: t.push (v, k))
+  for e in topk.maybeOrdered(t, order): yield (e[1], e[0])
 
 proc initDISet*[K](initialSize=0, numer=diNumer, denom=diDenom,
                    minFree=diMinFree, growPow2=diGrowPow2, rehash=diRehash,

diff --git a/adix/lptabz.nim b/adix/lptabz.nim
@@ -33,8 +33,8 @@
 ## `seq[(K,V)]`.  6..8 bits avoids most "double cache misses" for miss
 ## lookups/inserts. `z=0` works if space matters more than time.
 
-import althash, memutil, bitop, heapqueue, sequint, strutils, memfiles
-export Hash, sequint
+import althash, memutil, bitop, topk, sequint, std/[strutils, memfiles]
+export Hash, sequint, topk.TopKOrder
 when not declared(assert): import std/[assertions, objectdollar]
 when declared(File):
   template stdOpen(x: varargs[untyped]): untyped = system.open(x)
@@ -1082,21 +1082,12 @@ proc merge*[K,V,Z;z:static int](c: var LPTabz[K,V,Z,z], b: LPTabz[K,V,Z,z]) =
   for key, val in b: c.inc(key, val)
 
 iterator topByVal*[K,V,Z;z:static int](c: LPTabz[K,V,Z,z], n=10,
-                                       min=V.low): (K, V) =
+                                       min=V.low, order=Cheap): (K, V) =
   ## Iterate from smallest to largest over biggest `n` items by value in `c`.
-  ## If `n==0` this is effectively heap sort of `c` by value `V`.
-  var q = initHeapQueue[(V, K)]()
-  for key, val in c:
-    if val >= min:
-      let e = (val, key)
-      if n == 0 or q.len < n: q.push(e)
-      elif e > q[0]: discard q.replace(e)
-  var y: (K, V)
-  while q.len > 0:        # q now has top n entries
-    let r = q.pop
-    y[0] = r[1]
-    y[1] = r[0]
-    yield y               # yield in ascending order
+  ## `order` can be `Cheap`, `Ascending`, or `Descending`.
+  var t = initTopK[(V,K)](n)
+  for k, v in lptabz.pairs(c): (if v >= min: t.push (v, k))
+  for e in topk.maybeOrdered(t, order): yield (e[1], e[0])
 
 iterator mostCommon*[K](xs: openArray[K], n=10): (K, int) =
   ## Iterate over (`n` most common values in `xs`, their counts) tuples.

diff --git a/adix/oats.nim b/adix/oats.nim
@@ -1,4 +1,4 @@
-import std/[hashes, heapqueue], adix/bitop
+import std/hashes, adix/[bitop, topk]; export topk.TopKOrder
 template pua*(T: typedesc): untyped = ptr UncheckedArray[T]
 # Since want no `setCap | pairs` to exist for fixed size | set-like `t`, what is
 # `concept` is driven by the external interface, but performance tweaks { like
@@ -159,22 +159,12 @@ iterator values*[K,Q,V](t: VOat[K,Q,V]): V =
 iterator pairs*[K,Q,V](t: VOat[K,Q,V]): (K, V) =
   for i in 0 ..< t.cap: (if t.used i: yield (t.key i, t.val i))
 
-iterator topByVal*[K,Q,V](s: VOat[K,Q,V], n=10, min=V.low): (K, V) =
+iterator topByVal*[K,Q,V](s: VOat[K,Q,V], n=10, min=V.low, order=topk.Cheap): (K, V)=
   ## Iterate from smallest to largest over biggest `n` items by value in `s`.
   ## If `n==0` this is effectively heapSort of `s` by value `V`.
-  proc `<`(a, b: (V,K)): bool = a[0] < b[0] # ignore K => only partial order
-  var q = initHeapQueue[(V,K)]()
-  for k, v in oats.pairs(s):
-    if v >= min:
-      let e = (v, k)
-      if n == 0 or q.len < n: q.push e
-      elif e > q[0]: discard q.replace(e)
-  var y: (K,V)
-  while q.len > 0:        # q now has top n entries
-    let r = q.pop
-    y[0] = r[1]
-    y[1] = r[0]
-    yield y               # yield in ascending order
+  var t = initTopK[(V,K)](n)
+  for k, v in oats.pairs(s): (if v >= min: t.push (v, k))
+  for e in topk.maybeOrdered(t, order): yield (e[1], e[0])
 
 template oatKStack*(s, Self, Cell, off, offT, K, Q) =
   ## Def routines for back-to-back/stacked variable length, unpadded key data.

diff --git a/tests/wf.nim b/tests/wf.nim
@@ -1,7 +1,6 @@
 when not declared(addFloat): import std/[formatfloat, typedthreads]
 when not declared(Thread): import std/threads
-import std/[heapqueue, hashes, osproc, times],
-       adix/lptabz, cligen/[mfile, mslice, osUt], cligen
+import std/[hashes,osproc,times], adix/lptabz, cligen/[mfile,mslice,osUt],cligen
 type
   Word   = distinct uint32
   Count  = uint32
@@ -37,7 +36,7 @@ proc hash(w: Word): Hash {.inline.} =
 proc `==`(a, b: Word): bool {.inline.} =
   a.len == b.len and cmemcmp(a.mem, b.mem, a.len) == 0
 
-proc `<`(a, b: Word): bool {.inline.} = # for heapqueue
+proc `<`(a, b: Word): bool {.inline.} = # for topk.push
   let c = cmemcmp(a.mem, b.mem, min(a.len, b.len))
   if c == 0: a.len < b.len else: c < 0
 
@@ -94,18 +93,6 @@ proc count(p: int, path: string) =      # split path into `p` ~equal segments
     else: work (parts[0].addr, hs[0].addr, nTs[0].addr) # ST-mode: No spawn
   else: stderr.write "wf: \"", path, "\" missing/irregular\n"
 
-iterator top(h: Histo, n=10): (Word, Count) =
-  var q = initHeapQueue[(Count, Word)]()
-  for key, val in h:
-    let elem = (val, key)               # maintain a heap..
-    if q.len < n: q.push(elem)          # ..of the biggest n items
-    elif elem > q[0]: discard q.replace(elem)
-  var y: (Word, Count)                  # yielded tuple
-  while q.len > 0:                      # q now has top n entries
-    let r = q.pop
-    y[0] = r[1]; y[1] = r[0]
-    yield y                             # yield in ASCENDING order
-
 proc wf(path:seq[string], n=10, c=false, N=false, jobs=1, sz=9999, tm=false) =
   ## Parallel word frequency tool for one file < 128 MiB and words < 32 chars.
   ## Aggregate multiple via, e.g., `cat \*\*/\*.txt > /dev/shm/inp`.  Similar
@@ -122,9 +109,10 @@ proc wf(path:seq[string], n=10, c=false, N=false, jobs=1, sz=9999, tm=false) =
     for wd, cnt in hs[i]: hs[0].mgetOrPut(wd, 0) += cnt
   if c: echo hs[0].len," unique ",nTs[0]," total"
   template o = echo (if N: $(c.float/nTs[0].float) else: $c)," ",w
-  if  n == 0: (for w, c in hs[0].pairs: o())      # unsorted whole
-  elif n > 0: (for w, c in hs[0].top(n): o())     # sorted top N
-  if tm: stderr.write epochTime() - t0, " sec\n"  # n < 0 = only `c`/tm
+  if   n == 0: (for w, c in hs[0].pairs: o())       # unsorted whole
+  elif n > 0 : (for w, c in hs[0].topByVal(n): o()) # unsorted top N
+  elif n < -1: (for w, c in hs[0].topByVal(-n, order=Descending): o()) # sorted
+  if tm: stderr.write epochTime() - t0, " sec\n"    # n == -1: only `c`/tm
 
-dispatch(wf, help={"n": "print top n; 0=>all", "c": "count only", "N": "norm",
+dispatch(wf, help={"n": "do top n; 0all,<0sort", "c": "count only", "N": "norm",
   "tm": "time", "jobs": "num threads; 0=>auto", "sz": "init size"})
diff --git a/tests/wfr.nim b/tests/wfr.nim
@@ -58,12 +58,13 @@ proc wfr(n=10, count=false,Norm=false, size=9999,dsize=81920, tm=false, Dlm="")=
   if count: echo h.len," unique ",nTot," total ",s.len," B"
   template output =
     if Norm: outu c.float/nTot.float," ",k,"\n" else: outu c," ",k,"\n"
-  if n == 0: (for (k, c) in pairs(h): output())
-  elif n > 0: (for (k, c) in h.topByVal(n): output())
+  if   n == 0: (for (k, c) in pairs(h): output())
+  elif n > 0 : (for (k, c) in h.topByVal(n): output())
+  elif n < -1: (for (k, c) in h.topByVal(-n, order=Descending): output())
   if tm: stderr.write epochTime() - t0, "\n"
 
 when isMainModule: dispatch wfr, help={
-  "n"    : "only emit most frequent `n` lines(!=0=>sorted)",
+  "n"    : "emit `n`-most common  lines(0:all; <0 sorted)",
   "count": "only emit counts: unique & grand total",
   "Norm" : "normalize frequencies by dividing by grand tot",
   "size" : "pre-size hash table for size unique entries",

diff --git a/util/lfreq.nim b/util/lfreq.nim
@@ -1,6 +1,5 @@
 when not declared(stdin): import std/[syncio, formatfloat]
-import std/[hashes, times, sugar, algorithm, strutils],
-       cligen, cligen/[mslice, strUt, osUt], adix/oats
+import std/[hashes,times,strutils], adix/oats, cligen/[mslice,strUt,osUt],cligen
 
 const bLen {.intdefine.} = 16   # <16K long;  RT params better but more work
 const bOff {.intdefine.} = 32   # <4G UNIQUE line data
@@ -29,7 +28,7 @@ when Counts is VROat[MSlice, MSlice, uint32]: {.warning: "Counts is a VROat"}
 
 proc incFailed(h: var Counts, ms: MSlice): bool =
   if ms.len > (1 shl bLen) - 1: # Careful to not overflow
-    erru "skipping too long line: ", ($ms)[0..<128], "\n"
+    erru "skipping too long line: ", ($ms)[0..<128], "...\n"
     return false                # Cannot go on LOCALLY
   h.upSert(ms, i):              # Found key @i:
     if h.dat[i].cnt == (1 shl bCnt) - 1:
@@ -66,15 +65,13 @@ proc lfreq(n=10, count=false, size=9999, dSize=81920, recTerm='\n',
       elif format[id.a] == 'f': fs.setLen 0; fs.fcvt c.float*nInv, 9; outu fs
       else: outu MSlice(mem: format[call.a].addr, len: call.len)
     outu RecTerm
-  if n == 0: (for (k, c) in pairs(h): output())
-  elif n > 0: (for (k, c) in h.topByVal(n): output())
-  elif n < -1:  # -1 is same as +1; Hijack value to mean no output()
-    var x = collect(for (k, c) in h.topByVal(-n): (k, c))
-    x.reverse; (for (k, c) in x: output())
-  if tm: stderr.write epochTime() - t0, "\n"
+  if   n == 0: (for (k, c) in pairs(h): output())
+  elif n > 0 : (for (k, c) in h.topByVal(n): output())
+  elif n < -1: (for (k, c) in h.topByVal(-n, order=Descending): output())
+  if tm: stderr.write epochTime() - t0, "\n"  # -n-1 for only time output
 
 when isMainModule: dispatch lfreq, help={
-  "n"    : "only emit most frequent `n` lines(!=0=>sorted)",
+  "n"    : "emit `n`-most common  lines(0:all; <0 sorted)",
   "count": "only emit counts: unique & grand total",
   "size" : "pre-size hash table for size unique entries",
   "dSize": "pre-size str data area to this many bytes",