Skip to content

Commit

Permalink
Use new adix/topk module to centralize/simplify, generalize (to asc,
Browse files Browse the repository at this point in the history
desc, cheap), and speed-up such work in client code.  For name hygiene,
libraries `ditab`, `lptabz`, `oats` re-export only `topk.TopKOrder`.
(But really things like `topByVal` that work on any Key-Value `pairs`
should be lifted into their own adix/algos module anyway.)

As part of the update, make `tests/wf|wfr` and `util/lfreq` consistent in their
use of `-n`, and add a "..." after the truncated overlong lines in `lfreq`.
  • Loading branch information
c-blake committed Jul 30, 2024
1 parent a4a0ce8 commit b39f8c1
Show file tree
Hide file tree
Showing 6 changed files with 35 additions and 77 deletions.
19 changes: 5 additions & 14 deletions adix/ditab.nim
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
## under another term like "direct indexing". `K` below must have an available
## conversion to ``int``. Duplicate keys cannot be allowed for this one.

import althash, sequint, heapqueue
import althash, sequint, topk; export topk.TopKOrder
when not declared(assert): import std/assertions
type
DITab*[K,V] = object ## Alphabet size determines `K`; `V` may be `void`
Expand Down Expand Up @@ -425,19 +425,10 @@ proc inc*[K,V: SomeInteger](t: var DITab[K,V], key: K,
proc merge*[K,V: SomeInteger](c: var DITab[K,V], b: DITab[K,V]) =
for key, val in b: c.inc(key, val)

iterator topByVal*[K,V](c: DITab[K,V], n=10, min=V.low): (K, V) =
var q = initHeapQueue[(V, K)]()
for key, val in c:
if val >= min:
let e = (val, key)
if q.len < n: q.push(e)
elif e > q[0]: discard q.replace(e)
var y: (K, V)
while q.len > 0: # q now has top n entries
let r = q.pop
y[0] = r[1]
y[1] = r[0]
yield y # yield in ascending order
iterator topByVal*[K,V](c: DITab[K,V], n=10, min=V.low, order=Cheap): (K, V) =
## Yield `(key, value)` pairs of `c` chosen by value through a `TopK` that is
## created with `initTopK[(V,K)](n)`; entries whose value is below `min` are
## never pushed.  `order` is forwarded to `topk.maybeOrdered` to pick the yield
## order (default `Cheap`).
## NOTE(review): behavior for `n <= 0` is decided by adix/topk - confirm there.
var t = initTopK[(V,K)](n) # value first: Nim tuple `<` is lexicographic
for k, v in ditab.pairs(c): (if v >= min: t.push (v, k)) # skip values < min
for e in topk.maybeOrdered(t, order): yield (e[1], e[0]) # swap back to (K, V)

proc initDISet*[K](initialSize=0, numer=diNumer, denom=diDenom,
minFree=diMinFree, growPow2=diGrowPow2, rehash=diRehash,
Expand Down
23 changes: 7 additions & 16 deletions adix/lptabz.nim
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@
## `seq[(K,V)]`. 6..8 bits avoids most "double cache misses" for miss
## lookups/inserts. `z=0` works if space matters more than time.

import althash, memutil, bitop, heapqueue, sequint, strutils, memfiles
export Hash, sequint
import althash, memutil, bitop, topk, sequint, std/[strutils, memfiles]
export Hash, sequint, topk.TopKOrder
when not declared(assert): import std/[assertions, objectdollar]
when declared(File):
template stdOpen(x: varargs[untyped]): untyped = system.open(x)
Expand Down Expand Up @@ -1082,21 +1082,12 @@ proc merge*[K,V,Z;z:static int](c: var LPTabz[K,V,Z,z], b: LPTabz[K,V,Z,z]) =
for key, val in b: c.inc(key, val)

iterator topByVal*[K,V,Z;z:static int](c: LPTabz[K,V,Z,z], n=10,
min=V.low): (K, V) =
min=V.low, order=Cheap): (K, V) =
## Iterate from smallest to largest over biggest `n` items by value in `c`.
## If `n==0` this is effectively heap sort of `c` by value `V`.
var q = initHeapQueue[(V, K)]()
for key, val in c:
if val >= min:
let e = (val, key)
if n == 0 or q.len < n: q.push(e)
elif e > q[0]: discard q.replace(e)
var y: (K, V)
while q.len > 0: # q now has top n entries
let r = q.pop
y[0] = r[1]
y[1] = r[0]
yield y # yield in ascending order
## `order` selects the yield order: `Cheap` (the default), `Ascending`, or
## `Descending`.  NOTE(review): the "smallest to largest" wording above likely
## holds only for `order=Ascending`, and `n` is now handed straight to
## `initTopK`, so the `n==0` remark should be re-checked against adix/topk.
var t = initTopK[(V,K)](n) # value first: Nim tuple `<` is lexicographic
for k, v in lptabz.pairs(c): (if v >= min: t.push (v, k)) # skip values < min
for e in topk.maybeOrdered(t, order): yield (e[1], e[0]) # swap back to (K, V)

iterator mostCommon*[K](xs: openArray[K], n=10): (K, int) =
## Iterate over (`n` most common values in `xs`, their counts) tuples.
Expand Down
20 changes: 5 additions & 15 deletions adix/oats.nim
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import std/[hashes, heapqueue], adix/bitop
import std/hashes, adix/[bitop, topk]; export topk.TopKOrder
template pua*(T: typedesc): untyped = ptr UncheckedArray[T]
# Since want no `setCap | pairs` to exist for fixed size | set-like `t`, what is
# `concept` is driven by the external interface, but performance tweaks { like
Expand Down Expand Up @@ -159,22 +159,12 @@ iterator values*[K,Q,V](t: VOat[K,Q,V]): V =
iterator pairs*[K,Q,V](t: VOat[K,Q,V]): (K, V) =
for i in 0 ..< t.cap: (if t.used i: yield (t.key i, t.val i))

iterator topByVal*[K,Q,V](s: VOat[K,Q,V], n=10, min=V.low): (K, V) =
iterator topByVal*[K,Q,V](s: VOat[K,Q,V], n=10, min=V.low, order=topk.Cheap): (K, V)=
## Iterate from smallest to largest over biggest `n` items by value in `s`.
## If `n==0` this is effectively heapSort of `s` by value `V`.
proc `<`(a, b: (V,K)): bool = a[0] < b[0] # ignore K => only partial order
var q = initHeapQueue[(V,K)]()
for k, v in oats.pairs(s):
if v >= min:
let e = (v, k)
if n == 0 or q.len < n: q.push e
elif e > q[0]: discard q.replace(e)
var y: (K,V)
while q.len > 0: # q now has top n entries
let r = q.pop
y[0] = r[1]
y[1] = r[0]
yield y # yield in ascending order
## `order` is forwarded to `topk.maybeOrdered` and selects the yield order.
## NOTE(review): the "smallest to largest" and `n==0` statements above were
## written for the earlier heap-based body; with `order` defaulting to
## `topk.Cheap` and `n` passed straight to `initTopK` - confirm both in topk.
var t = initTopK[(V,K)](n) # value first: Nim tuple `<` is lexicographic
for k, v in oats.pairs(s): (if v >= min: t.push (v, k)) # skip values < min
for e in topk.maybeOrdered(t, order): yield (e[1], e[0]) # swap back to (K, V)

template oatKStack*(s, Self, Cell, off, offT, K, Q) =
## Def routines for back-to-back/stacked variable length, unpadded key data.
Expand Down
26 changes: 7 additions & 19 deletions tests/wf.nim
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
when not declared(addFloat): import std/[formatfloat, typedthreads]
when not declared(Thread): import std/threads
import std/[heapqueue, hashes, osproc, times],
adix/lptabz, cligen/[mfile, mslice, osUt], cligen
import std/[hashes,osproc,times], adix/lptabz, cligen/[mfile,mslice,osUt],cligen
type
Word = distinct uint32
Count = uint32
Expand Down Expand Up @@ -37,7 +36,7 @@ proc hash(w: Word): Hash {.inline.} =
proc `==`(a, b: Word): bool {.inline.} =
a.len == b.len and cmemcmp(a.mem, b.mem, a.len) == 0

proc `<`(a, b: Word): bool {.inline.} = # for heapqueue
proc `<`(a, b: Word): bool {.inline.} = # for topk.push
## Order two words by `cmemcmp` over their common-length prefix; when that
## prefix compares equal, the shorter word sorts first.
let c = cmemcmp(a.mem, b.mem, min(a.len, b.len)) # compare only shared bytes
if c == 0: a.len < b.len else: c < 0 # tie on prefix => length decides

Expand Down Expand Up @@ -94,18 +93,6 @@ proc count(p: int, path: string) = # split path into `p` ~equal segments
else: work (parts[0].addr, hs[0].addr, nTs[0].addr) # ST-mode: No spawn
else: stderr.write "wf: \"", path, "\" missing/irregular\n"

iterator top(h: Histo, n=10): (Word, Count) =
var q = initHeapQueue[(Count, Word)]()
for key, val in h:
let elem = (val, key) # maintain a heap..
if q.len < n: q.push(elem) # ..of the biggest n items
elif elem > q[0]: discard q.replace(elem)
var y: (Word, Count) # yielded tuple
while q.len > 0: # q now has top n entries
let r = q.pop
y[0] = r[1]; y[1] = r[0]
yield y # yield in ASCENDING order

proc wf(path:seq[string], n=10, c=false, N=false, jobs=1, sz=9999, tm=false) =
## Parallel word frequency tool for one file < 128 MiB and words < 32 chars.
## Aggregate multiple via, e.g., `cat \*\*/\*.txt > /dev/shm/inp`. Similar
Expand All @@ -122,9 +109,10 @@ proc wf(path:seq[string], n=10, c=false, N=false, jobs=1, sz=9999, tm=false) =
for wd, cnt in hs[i]: hs[0].mgetOrPut(wd, 0) += cnt
if c: echo hs[0].len," unique ",nTs[0]," total"
template o = echo (if N: $(c.float/nTs[0].float) else: $c)," ",w
if n == 0: (for w, c in hs[0].pairs: o()) # unsorted whole
elif n > 0: (for w, c in hs[0].top(n): o()) # sorted top N
if tm: stderr.write epochTime() - t0, " sec\n" # n < 0 = only `c`/tm
# Output mode is selected by `n`: 0 => every entry in table order; n > 0 => top
# n in whatever order `topByVal` yields by default; n < -1 => top |n| in
# descending order; n == -1 matches no branch, so no per-word output is made.
if n == 0: (for w, c in hs[0].pairs: o()) # unsorted whole
elif n > 0 : (for w, c in hs[0].topByVal(n): o()) # unsorted top N
elif n < -1: (for w, c in hs[0].topByVal(-n, order=Descending): o()) # sorted; -n is the count, as in util/lfreq
if tm: stderr.write epochTime() - t0, " sec\n" # n == -1: only `c`/tm

dispatch(wf, help={"n": "print top n; 0=>all", "c": "count only", "N": "norm",
dispatch(wf, help={"n": "do top n; 0all,<0sort", "c": "count only", "N": "norm",
"tm": "time", "jobs": "num threads; 0=>auto", "sz": "init size"})
7 changes: 4 additions & 3 deletions tests/wfr.nim
Original file line number Diff line number Diff line change
Expand Up @@ -58,12 +58,13 @@ proc wfr(n=10, count=false,Norm=false, size=9999,dsize=81920, tm=false, Dlm="")=
if count: echo h.len," unique ",nTot," total ",s.len," B"
template output =
if Norm: outu c.float/nTot.float," ",k,"\n" else: outu c," ",k,"\n"
if n == 0: (for (k, c) in pairs(h): output())
elif n > 0: (for (k, c) in h.topByVal(n): output())
# Output mode is selected by `n`: 0 => every entry in table order; n > 0 => top
# n in whatever order `topByVal` yields by default; n < -1 => top |n| in
# descending order; n == -1 matches no branch, so no per-line output is made.
if n == 0: (for (k, c) in pairs(h): output())
elif n > 0 : (for (k, c) in h.topByVal(n): output())
elif n < -1: (for (k, c) in h.topByVal(-n, order=Descending): output()) # -n is the count, as in util/lfreq
if tm: stderr.write epochTime() - t0, "\n"

when isMainModule: dispatch wfr, help={
"n" : "only emit most frequent `n` lines(!=0=>sorted)",
"n" : "emit `n`-most common lines(0:all; <0 sorted)",
"count": "only emit counts: unique & grand total",
"Norm" : "normalize frequencies by dividing by grand tot",
"size" : "pre-size hash table for size unique entries",
Expand Down
17 changes: 7 additions & 10 deletions util/lfreq.nim
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
when not declared(stdin): import std/[syncio, formatfloat]
import std/[hashes, times, sugar, algorithm, strutils],
cligen, cligen/[mslice, strUt, osUt], adix/oats
import std/[hashes,times,strutils], adix/oats, cligen/[mslice,strUt,osUt],cligen

const bLen {.intdefine.} = 16 # <16K long; RT params better but more work
const bOff {.intdefine.} = 32 # <4G UNIQUE line data
Expand Down Expand Up @@ -29,7 +28,7 @@ when Counts is VROat[MSlice, MSlice, uint32]: {.warning: "Counts is a VROat"}

proc incFailed(h: var Counts, ms: MSlice): bool =
if ms.len > (1 shl bLen) - 1: # Careful to not overflow
erru "skipping too long line: ", ($ms)[0..<128], "\n"
erru "skipping too long line: ", ($ms)[0..<128], "...\n"
return false # Cannot go on LOCALLY
h.upSert(ms, i): # Found key @i:
if h.dat[i].cnt == (1 shl bCnt) - 1:
Expand Down Expand Up @@ -66,15 +65,13 @@ proc lfreq(n=10, count=false, size=9999, dSize=81920, recTerm='\n',
elif format[id.a] == 'f': fs.setLen 0; fs.fcvt c.float*nInv, 9; outu fs
else: outu MSlice(mem: format[call.a].addr, len: call.len)
outu RecTerm
if n == 0: (for (k, c) in pairs(h): output())
elif n > 0: (for (k, c) in h.topByVal(n): output())
elif n < -1: # -1 is same as +1; Hijack value to mean no output()
var x = collect(for (k, c) in h.topByVal(-n): (k, c))
x.reverse; (for (k, c) in x: output())
if tm: stderr.write epochTime() - t0, "\n"
# Output mode is selected by `n`: 0 => every entry in table order; n > 0 => top
# n in whatever order `topByVal` yields by default; n < -1 => top |n| (hence
# the `-n` argument) in descending order.
if n == 0: (for (k, c) in pairs(h): output())
elif n > 0 : (for (k, c) in h.topByVal(n): output())
elif n < -1: (for (k, c) in h.topByVal(-n, order=Descending): output())
if tm: stderr.write epochTime() - t0, "\n" # n == -1 matches no branch above: no output() calls, timing still printed

when isMainModule: dispatch lfreq, help={
"n" : "only emit most frequent `n` lines(!=0=>sorted)",
"n" : "emit `n`-most common lines(0:all; <0 sorted)",
"count": "only emit counts: unique & grand total",
"size" : "pre-size hash table for size unique entries",
"dSize": "pre-size str data area to this many bytes",
Expand Down

0 comments on commit b39f8c1

Please sign in to comment.