From 8e97ef3fb3a8b8b85308ffb420db71763bdf5773 Mon Sep 17 00:00:00 2001
From: Jonathan Chappelow
Date: Thu, 22 Aug 2024 13:50:19 -0500
Subject: [PATCH] pg: stats, mcvs, histo

---
 common/sql/statistics.go             | 143 ++++++-
 core/types/uuid.go                   |   8 +
 internal/sql/pg/db_live_test.go      |  36 +-
 internal/sql/pg/histo.go             | 323 ++++++++++++++++
 internal/sql/pg/histo_test.go        | 538 ++++++++++++++++++++++++++
 internal/sql/pg/stats.go             | 547 +++++++++++++++++++++++++--
 internal/sql/pg/stats_enc.go         |  79 ++++
 internal/sql/pg/stats_enc_test.go    | 143 +++++++
 internal/sql/pg/stats_test.go        | 141 ++++++-
 internal/sql/pg/stats_update_test.go | 121 ++++++
 internal/sql/pg/system.go            | 120 +++++-
 internal/sql/pg/system_test.go       | 193 ++++++++++
 internal/sql/pg/types_test.go        |  29 ++
 13 files changed, 2315 insertions(+), 106 deletions(-)
 create mode 100644 internal/sql/pg/histo.go
 create mode 100644 internal/sql/pg/histo_test.go
 create mode 100644 internal/sql/pg/stats_enc.go
 create mode 100644 internal/sql/pg/stats_enc_test.go
 create mode 100644 internal/sql/pg/stats_update_test.go

diff --git a/common/sql/statistics.go b/common/sql/statistics.go
index 7b38ea61c..6e0ca05f1 100644
--- a/common/sql/statistics.go
+++ b/common/sql/statistics.go
@@ -15,8 +15,7 @@ type Statistics struct {
 	RowCount int64
 
 	ColumnStatistics []ColumnStatistics
-
-	//Selectivity, for plan statistics
+	// NOTE: above may be better as []any to work with a generic ColStatsT[T]
 }
 
 func (s *Statistics) String() string {
@@ -27,17 +26,18 @@ func (s *Statistics) String() string {
 	}
 	for i, cs := range s.ColumnStatistics {
 		fmt.Fprintf(&st, " Column %d:\n", i)
-		fmt.Fprintf(&st, " - Min/Max = %v / %v\n", cs.Min, cs.Max)
+		if _, ok := cs.Min.(string); ok {
+			fmt.Fprintf(&st, " - Min/Max = %.64s / %.64s\n", cs.Min, cs.Max)
+		} else {
+			fmt.Fprintf(&st, " - Min/Max = %v / %v\n", cs.Min, cs.Max)
+		}
 		fmt.Fprintf(&st, " - NULL count = %v\n", cs.NullCount)
+		fmt.Fprintf(&st, " - Num MCVs = %v\n", len(cs.MCFreqs))
+		fmt.Fprintf(&st, " - Histogram = {%v}\n", cs.Histogram) // it's any, but also a fmt.Stringer
 	}
 	return st.String()
 }
 
-type ValCount struct {
-	Val   any
-	Count int
-}
-
 // ColumnStatistics contains statistics about a column.
 type ColumnStatistics struct {
 	NullCount int64
@@ -58,16 +58,133 @@ type ColumnStatistics struct {
 	// MCVs []ValCount
 	// MCVs map[cmp.Ordered]
 
-	// MCVals []any
-	// MCFreqs []int
+	MCVals  []any // []T
+	MCFreqs []int
+	// ^ NOTE: MCVals was easier in many ways with just any.([]T), but much
+	// more inconvenient in other ways, so we have it as an []any. May go back.
 
 	// DistinctCount is harder. For example, unless we sub-sample
 	// (deterministically), tracking distinct values could involve a data
 	// structure with the same number of elements as rows in the table.
-	DistinctCount int64
+	// or a sophisticated algo, e.g. https://github.com/axiomhq/hyperloglog
+	// DistinctCount int64
+	// alt, -1 means don't know
+
+	// AvgSize can affect cost as it changes the number of "pages" in postgres
+	// terminology, representing the size of data returned or processed by an
+	// expression.
 	AvgSize int64 // maybe: length of text, length of array, otherwise not used for scalar?
 
-	// without histogram, we can make uniformity assumption to simplify the cost model
-	//Histogram []HistogramBucket
+	Histogram any // histo[T]
+}
+
+/* Perhaps I should have started fresh with a fully generic column stats struct... under consideration.
+
+type ColStatsT[T any] struct {
+	NullCount int
+	Min       T
+	MinCount  int
+	Max       T
+	MaxCount  int
+	MCVals    []T
+	MCFreqs   []int
+}
+*/
+
+func NewEmptyStatistics(numCols int) *Statistics {
+	return &Statistics{
+		RowCount:         0,
+		ColumnStatistics: make([]ColumnStatistics, numCols),
+	}
+}
+
+// ALL of the following types are from the initial query plan draft PR by Yaiba.
+// Only TableRef gets much use in the current statistics work. An integration
+// branch uses the other field and schema types a bit more, but it's easy to
+// change any of this...
+
+// TableRef is a PostgreSQL-schema-qualified table name.
+type TableRef struct {
+	Namespace string // e.g. schema in Postgres, derived from Kwil dataset schema DBID
+	Table     string
+}
+
+// String returns the fully qualified table name as "namespace.table" if
+// Namespace is set, otherwise it just returns the table name.
+func (t *TableRef) String() string {
+	if t.Namespace != "" {
+		return fmt.Sprintf("%s.%s", t.Namespace, t.Table)
+	}
+	return t.Table
+}
+
+type ColumnDef struct {
+	Relation *TableRef
+	Name     string
+}
+
+func ColumnUnqualified(name string) *ColumnDef {
+	return &ColumnDef{Name: name}
+}
+
+func Column(table *TableRef, name string) *ColumnDef {
+	return &ColumnDef{Relation: table, Name: name}
+}
+
+// Field represents a field (column) in a schema.
+type Field struct {
+	Rel *TableRef
+
+	Name     string
+	Type     string
+	Nullable bool
+	HasIndex bool
+}
+
+func NewField(name string, dataType string, nullable bool) Field {
+	return Field{Name: name, Type: dataType, Nullable: nullable}
+}
+
+func NewFieldWithRelation(name string, dataType string, nullable bool, relation *TableRef) Field {
+	return Field{Name: name, Type: dataType, Nullable: nullable, Rel: relation}
+}
+
+func (f *Field) Relation() *TableRef {
+	return f.Rel
+}
+
+func (f *Field) QualifiedColumn() *ColumnDef {
+	return Column(f.Rel, f.Name)
+}
+
+// Schema represents a database as a slice of all columns in all relations. See
+// also Field.
+type Schema struct {
+	Fields []Field
+}
+
+func NewSchema(fields ...Field) *Schema {
+	return &Schema{Fields: fields}
+}
+
+func NewSchemaQualified(relation *TableRef, fields ...Field) *Schema {
+	for i := range fields {
+		fields[i].Rel = relation
+	}
+	return &Schema{Fields: fields}
+}
+
+func (s *Schema) String() string {
+	var fields []string
+	for _, f := range s.Fields {
+		fields = append(fields, fmt.Sprintf("%s/%s", f.Name, f.Type))
+	}
+	return fmt.Sprintf("[%s]", strings.Join(fields, ", "))
+}
+
+type DataSource interface {
+	// Schema returns the schema for the underlying data source.
+	Schema() *Schema
+	// Statistics returns the statistics of the data source.
+	Statistics() *Statistics
+}
diff --git a/core/types/uuid.go b/core/types/uuid.go
index a45c740b4..9a7b1fbc4 100644
--- a/core/types/uuid.go
+++ b/core/types/uuid.go
@@ -1,6 +1,7 @@
 package types
 
 import (
+	"bytes"
 	"database/sql"
 	"database/sql/driver"
 	"encoding/json"
@@ -14,6 +15,13 @@ var namespace = uuid.MustParse("cc1cd90f-b4db-47f4-b6df-4bbe5fca88eb")
 // UUID is a rfc4122 compliant uuidv5
 type UUID [16]byte
 
+// CmpUUID compares two UUIDs, returning 0 if equal, -1 if u < v, and 1 if u > v.
+// This satisfies the comparison function required by many generic functions in
+// the standard library and Kwil.
+func CmpUUID(u, v UUID) int {
+	return bytes.Compare(u[:], v[:])
+}
+
 // NewUUIDV5 generates a uuidv5 from a byte slice.
 // This is used to deterministically generate uuids.
 func NewUUIDV5(from []byte) *UUID {
diff --git a/internal/sql/pg/db_live_test.go b/internal/sql/pg/db_live_test.go
index 20a58f935..92bc2d462 100644
--- a/internal/sql/pg/db_live_test.go
+++ b/internal/sql/pg/db_live_test.go
@@ -7,6 +7,7 @@ import (
 	"cmp"
 	"context"
 	"fmt"
+	"os"
 	"reflect"
 	"slices"
 	"strconv"
@@ -28,7 +29,7 @@ import (
 
 func TestMain(m *testing.M) {
 	// UseLogger(log.NewStdOut(log.InfoLevel))
-	m.Run()
+	os.Exit(m.Run())
 }
 
 const (
@@ -266,13 +267,6 @@ func TestNULL(t *testing.T) {
 	require.Equal(t, bvn.Int64, insB)
 }
 
-// typeFor returns the reflect.Type that represents the type argument T. TODO:
-// Remove this in favor of reflect.TypeFor when Go 1.22 becomes the minimum
-// required version since it is not available in Go 1.21.
-func typeFor[T any]() reflect.Type {
-	return reflect.TypeOf((*T)(nil)).Elem()
-}
-
 func TestScanVal(t *testing.T) {
 	cols := []ColInfo{
 		{Pos: 1, Name: "a", DataType: "bigint", Nullable: false},
@@ -974,32 +968,6 @@ func TestTypeRoundtrip(t *testing.T) {
 	}
 }
 
-// mustDecimal panics if the string cannot be converted to a decimal.
-func mustDecimal(s string) *decimal.Decimal {
-	d, err := decimal.NewFromString(s)
-	if err != nil {
-		panic(err)
-	}
-	return d
-}
-
-func mustParseUUID(s string) *types.UUID {
-	u, err := types.ParseUUID(s)
-	if err != nil {
-		panic(err)
-	}
-	return u
-}
-
-// mustUint256 panics if the string cannot be converted to a Uint256.
-func mustUint256(s string) *types.Uint256 {
-	u, err := types.Uint256FromString(s)
-	if err != nil {
-		panic(err)
-	}
-	return u
-}
-
 func Test_DelayedTx(t *testing.T) {
 	ctx := context.Background()
 
diff --git a/internal/sql/pg/histo.go b/internal/sql/pg/histo.go
new file mode 100644
index 000000000..cd741864d
--- /dev/null
+++ b/internal/sql/pg/histo.go
@@ -0,0 +1,323 @@
+package pg
+
+// This file defines a generic histogram type, and many of the interpolation
+// functions necessary to use it and create its bounds.
+
+import (
+	"fmt"
+	"math"
+	"math/big"
+	"slices"
+	"strings"
+
+	"github.com/kwilteam/kwil-db/core/types"
+	"github.com/kwilteam/kwil-db/core/types/decimal"
+)
+
+type histo[T any] struct {
+	// Normally given N boundaries, there would be N-1 bins/freqs.
+	// In this histogram, there are two special bins that catch all
+	// below the lowest bin boundary, and above the highest bin boundary.
+	// This is done because we need to decide the boundaries before the
+	// actual min and max are known, *and* we have to limit the number of bins.
+	//
+	// bounds:       h_0       h_1       h_2
+	//                |         |         |
+	// freqs:    f_0  |   f_1   |   f_2   |   f_3
+	//
+	// bins include values on (h_i,h_j]
+
+	bounds []T   // len n
+	freqs  []int // len n+1
+
+	comp func(a, b T) int // -1 for a < b, 0 for a == b, 1 for a > b
+
+	// For more accurate summation up to a given value that is not also equal to
+	// one of the bounds, linear interpolation can be used. (TODO)
+	// interp func(f float64, a, b T) T
+	// interpF func(v, a, b T) float64 // (b-v)/(b-a) -- [a...v...b]
+}
+
+// ins adds an observed value into the appropriate bin and returns the index of
+// the updated bin.
+func (h histo[T]) ins(v T) int {
+	loc, _ := slices.BinarySearchFunc(h.bounds, v, h.comp)
+	h.freqs[loc]++
+	return loc
+}
+
+// rm removes an observed value from the appropriate bin and returns the index
+// of the updated bin.
+func (h histo[T]) rm(v T) int { + loc, _ := slices.BinarySearchFunc(h.bounds, v, h.comp) + h.freqs[loc]-- + if f := h.freqs[loc]; f < 0 { + panic("accounting error -- negative bin count on rm") + } + return loc +} + +func (h *histo[T]) TotalCount() int { + var total int + for _, f := range h.freqs { + total += f + } + return total +} + +func (h histo[T]) String() string { + totalFreq := h.TotalCount() + return fmt.Sprintf("total = %d, bounds = %v, freqs = %v", + totalFreq, h.bounds, h.freqs) +} + +// ltTotal returns the cumulative frequency for values less than (or equal) to +// the given value. There will be a host of methods along these lines (e.g. +// range, greater, equal) to support selectivity computation. +// +// Presently this is a simple summation, but it should be updated to perform +// interpolation when a value is not also exactly a boundary. +func (h histo[T]) ltTotal(v T) int { //nolint:unused + loc, _ := slices.BinarySearchFunc(h.bounds, v, h.comp) + + var freq int + for i, f := range h.freqs { + + if i == loc { + /*if found { // no interp from next bin, just add this freq and break + freq += f + } else { // the value is somewhere between bins (before bin i) => linearly interpolate + freq += int(float64(f) * h.interp(v, h.bounds[i-1], h.bounds[i])) + }*/ + freq += f + break + } + + freq += f + } + return freq +} + +type SignedInt interface { + ~int | ~int8 | ~int16 | ~int32 | ~int64 +} + +type UnsignedInt interface { + ~uint | ~uint8 | ~uint16 | ~uint32 | ~uint64 +} + +type Num interface { + SignedInt | UnsignedInt | float32 | float64 +} + +// The following interpolation functions are used to create histogram bounds +// from min and max values. They will also be used for partial summation. + +func interpNumF[T Num](f float64, a, b T) float64 { + return f*float64(b) + (1-f)*float64(a) + // return float64(a) + f*(float64(b)-float64(a)) +} + +func interpNum[T Num](f float64, a, b T) T { + return T(interpNumF(f, a, b)) + // return a + T(f*(float64(b)-float64(a))) +} + +func interpBig(f float64, a, b *big.Int) *big.Int { + if b.Cmp(a) <= 0 { + panic("b must not be less than a") + } + + // return a number on [a,b] computed via interpolation with f on [0,1] + // representing where between the two numbers. + + diff := new(big.Int).Sub(b, a) + frac := new(big.Float).SetPrec(big.MaxPrec).SetInt(diff) + frac = frac.Mul(frac, big.NewFloat(f).SetPrec(big.MaxPrec)) // f*(b-a) + + // a + frac + fracInt, _ := frac.Int(nil) + + return new(big.Int).Add(a, fracInt) +} + +// interpUint256 and interpDec below are both defined in terms of interpBig. 
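
For intuition, a tiny worked example of interpBig's contract (editor's sketch,
not part of the patch): f on [0,1] selects a point on [a,b], with the
fractional remainder truncated toward a.

	a, b := big.NewInt(100), big.NewInt(200)
	interpBig(0, a, b)    // 100 (the lower bound)
	interpBig(0.25, a, b) // 125 = 100 + 0.25*(200-100)
	interpBig(1, a, b)    // 200 (the upper bound)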
+
+func interpUint256(f float64, a, b *types.Uint256) *types.Uint256 {
+	c := interpBig(f, a.ToBig(), b.ToBig())
+	d, err := types.Uint256FromBig(c)
+	if err != nil { // shouldn't even be possible if a and b were not NaNs
+		panic(err.Error())
+	}
+	return d
+}
+
+func interpDec(f float64, a, b *decimal.Decimal) *decimal.Decimal {
+	c := interpBig(f, a.BigInt(), b.BigInt())
+	// d, err := decimal.NewFromBigInt(c, a.Exp())
+	// d.SetPrecisionAndScale(a.Precision(), a.Scale())
+	d, err := decimal.NewExplicit(c.String(), a.Precision(), a.Scale())
+	if err != nil {
+		panic(err.Error())
+	}
+	// This is messier with Decimal's math methods, and I don't yet know if
+	// there'd be a benefit:
+	// bma, err := decimal.Sub(b, a) // etc
+	return d
+}
+
+func interpBts(f float64, a, b []byte) []byte {
+	ai := big.NewInt(0).SetBytes(a)
+	bi := big.NewInt(0).SetBytes(b)
+	return interpBig(f, ai, bi).Bytes()
+}
+
+func interpUUID(f float64, a, b types.UUID) types.UUID {
+	return types.UUID(interpBts(f, a[:], b[:]))
+}
+
+// interpBool is largely nonsense and should be unused as there should not ever
+// be a boolean histogram, but it is here for completeness. Could just panic...
+func interpBool(f float64, a, b bool) bool {
+	if f < 0.5 {
+		return a
+	}
+	return b
+}
+
+// interpString needs more consideration and testing. It MUST be consistent with
+// lexicographic comparison a la strings.Compare (e.g. "x" > "abc"), so we can't
+// just interpolate as bytes, which takes numeric semantics. For now we
+// right-pad to make them the same length, then interpolate each character.
+func interpString(f float64, a, b string) string {
+	if f < 0 || f > 1 {
+		panic("f out of range")
+	}
+	if a > b {
+		panic("a > b")
+	}
+	// Ensure both strings are the same length by padding with \0
+	maxLen := len(a)
+	if len(b) > maxLen {
+		maxLen = len(b)
+	}
+
+	a = padString(a, maxLen)
+	b = padString(b, maxLen)
+
+	result := make([]byte, maxLen)
+	// Interpolate each character, wonky but remains legible
+	for i := range result {
+		charA, charB := a[i], b[i]
+		diff := float64(charB) - float64(charA)
+		result[i] = charA + byte(math.Round(f*diff))
+	}
+
+	// Convert the byte slice back to a string and trim any null characters
+	return strings.TrimRight(string(result), "\x00")
+}
+
+func padString(s string, length int) string {
+	if len(s) < length {
+		return s + strings.Repeat("\x00", length-len(s))
+	}
+	return s
+}
+
+// makeBounds computes n+1 evenly spaced histogram bounds given the range [a,b].
+// However, if two bounds would be equal, as is possible when T is an integer
+// and n is too small, there will be fewer bounds.
+func makeBounds[T any](n int, a, b T, comp func(a, b T) int, interp func(f float64, a, b T) T) []T {
+	if comp(a, b) == 1 {
+		panic("no good")
+	}
+	bounds := make([]T, 0, n+1)
+	f := 1 / float64(n)
+	for i := 0; i <= n; i++ {
+		next := interp(f*float64(i), a, b)
+		if i > 0 && comp(next, bounds[len(bounds)-1]) == 0 {
+			continue // trying to over-subdivide, easy with integers
+		}
+		bounds = append(bounds, next)
+	}
+	return bounds
+}
+
+func makeHisto[T any](bounds []T, comp func(a, b T) int) histo[T] {
+	return histo[T]{
+		bounds: bounds,
+		freqs:  make([]int, len(bounds)+1),
+		comp:   comp,
+	}
+}
+
+// interpNumRat provides a possibly more accurate approach, particularly when T
+// is floating point. May remove.
+func interpNumRat[T Num](f *big.Rat, a, b T) T { + return T(interpNumRatF(f, a, b)) +} + +func interpNumRatF[T Num](f *big.Rat, a, b T) float64 { + ra, _ := new(big.Float).SetPrec(big.MaxPrec).SetFloat64(float64(a)).Rat(nil) + rb, _ := new(big.Float).SetPrec(big.MaxPrec).SetFloat64(float64(b)).Rat(nil) + // ra := new(big.Rat).SetFloat64(float64(a)) + // rb := new(big.Rat).SetFloat64(float64(b)) + + // a + f*(b-a) + rbma := new(big.Rat).Sub(rb, ra) + fab := new(big.Rat).Mul(f, rbma) + res, exact := new(big.Rat).Add(ra, fab).Float64() + + if exact { + return res + } + + // just go with all float + ff, _ := f.Float64() + return interpNumF(ff, a, b) + + /*different groupings that may yield better results: + // f*b + (1-f)*a + + // 1-f + mf := big.NewRat(1, 1) + mf.Sub(mf, f) + // (1-r) * a + mf.Mul(mf, ra) + + // f*b + fb := new(big.Rat).Mul(f, rb) + // f*b + (1-r) * a + res, exact = fb.Add(fb, mf).Float64() + //fmt.Println(exact) + + if exact { return res } + + // f*b + a - f*a + fb = new(big.Rat).Mul(f, rb) + fb.Add(fb, ra) + fa := new(big.Rat).Mul(f, ra) + res, exact = fb.Sub(fb, fa).Float64() + //fmt.Println(exact) + + return res + */ +} + +func makeBoundsNum[T Num](n int, a, b T) []T { + if b <= a { + panic("no good") + } + bounds := make([]T, 0, n+1) + // f := 1 / float64(n) + for i := 0; i <= n; i++ { + fi := big.NewRat(int64(i), int64(n)) // 1/n + next := interpNumRat(fi, a, b) + // next := interpNum(f*float64(i), a, b) + if i > 0 && next == bounds[len(bounds)-1] { + continue // trying to over-subdivide, easy with integers + } + bounds = append(bounds, next) + } + return bounds +} diff --git a/internal/sql/pg/histo_test.go b/internal/sql/pg/histo_test.go new file mode 100644 index 000000000..bb1553fba --- /dev/null +++ b/internal/sql/pg/histo_test.go @@ -0,0 +1,538 @@ +package pg + +import ( + "bytes" + "cmp" + "math" + "math/big" + "math/rand" + "reflect" + "strings" + "testing" +) + +func Test_histo_explicit_int(t *testing.T) { + comp := cmp.Compare[int] + interp := interpNum[int] + bounds := makeBounds(10, int(-100), 800, comp, interp) + hInt := makeHisto(bounds, comp) + for i := 0; i < 1000; i++ { + hInt.ins(rand.Intn(1200) - 200) + } + + t.Log(hInt) +} + +func Test_histo_num(t *testing.T) { + bounds := makeBoundsNum[int](10, -100, 800) + hInt := makeHisto(bounds, cmp.Compare[int]) + for i := 0; i < 1000; i++ { + hInt.ins(rand.Intn(1200) - 200) + } + + t.Log(hInt) +} + +func Test_histo_float(t *testing.T) { + bounds := makeBoundsNum[float64](10, -100, 800) + hInt := makeHisto(bounds, cmp.Compare[float64]) + for i := 0; i < 1000; i++ { + hInt.ins(float64(rand.Intn(1200) - 200)) + } + // Without big.Rat impl: + // bounds = [-100 -10 80 170.00000000000003 260 350 440.00000000000006 530 620 710 800] + + t.Log(hInt) +} + +func Test_interpNumF(t *testing.T) { + tests := []struct { + name string + f float64 + a int + b int + expected float64 + }{ + {"Zero interpolation", 0, 10, 20, 10}, + {"Full interpolation", 1, 10, 20, 20}, + {"Mid interpolation", 0.5, 10, 20, 15}, + {"Quarter interpolation", 0.25, 10, 20, 12.5}, + {"Three-quarter interpolation", 0.75, 10, 20, 17.5}, + {"Negative numbers", 0.5, -10, 10, 0}, + {"Same numbers", 0.5, 5, 5, 5}, + {"Large numbers", 0.5, 1000000, 2000000, 1500000}, + {"Small fractional numbers", 0.1, 1, 2, 1.1}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := interpNumF(tt.f, tt.a, tt.b) + if result != tt.expected { + t.Errorf("interpNumF(%v, %v, %v) = %v, want %v", tt.f, tt.a, tt.b, result, tt.expected) + } + }) + } +} 
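
Before the remaining interpolation tests, a quick sketch of how the histogram
pieces compose (editor's example, not part of the patch; it assumes only the
makeBoundsNum, makeHisto, ins, and ltTotal definitions from histo.go above):

	bounds := makeBoundsNum[int64](10, 0, 1000) // 11 bounds => 12 bins, incl. under/overflow
	h := makeHisto(bounds, cmp.Compare[int64])
	h.ins(-5)   // below the lowest bound: counted in freqs[0]
	h.ins(500)  // interior bin ending at the bound 500
	h.ins(2000) // above the highest bound: counted in freqs[len(bounds)]
	_ = h.ltTotal(500) // cumulative frequency of values <= 500, here 2 (-5 and 500)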
+ +func Test_interpNumF_float64(t *testing.T) { + tests := []struct { + name string + f float64 + a float64 + b float64 + expected float64 + }{ + {"Fractional interpolation", 0.3, 1.5, 3.5, 2.1}, + {"Negative fractional interpolation", 0.7, -2.5, -1.5, -1.8}, + {"Zero to one interpolation", 0.5, 0, 1, 0.5}, + {"Very small numbers", 0.5, 1e-10, 2e-10, 1.5e-10}, + {"Very large numbers", 0.5, 1e10, 2e10, 1.5e10}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := interpNumF(tt.f, tt.a, tt.b) + if !almostEqual(result, tt.expected, 1e-9) { + t.Errorf("interpNumF(%v, %v, %v) = %v, want %v", tt.f, tt.a, tt.b, result, tt.expected) + } + }) + } +} + +func almostEqual(a, b, tolerance float64) bool { + return math.Abs(a-b) <= tolerance +} + +func Test_interpNum(t *testing.T) { + tests := []struct { + name string + f float64 + a int + b int + expected int + }{ + {"Extreme interpolation near 1", 0.99, 0, 100, 99}, + {"Extreme interpolation near 0", 0.01, 0, 100, 1}, + {"Interpolation with negative and positive", 0.6, -50, 50, 10}, + {"Interpolation with both negative", 0.4, -100, -50, -80}, + {"Interpolation with large numbers", 0.75, 1000000, 2000000, 1750000}, + {"Zero interpolation with large difference", 0, -1000000, 1000000, -1000000}, + {"Full interpolation with large difference", 1, -1000000, 1000000, 1000000}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := interpNum(tt.f, tt.a, tt.b) + if result != tt.expected { + t.Errorf("interpNum(%v, %v, %v) = %v, want %v", tt.f, tt.a, tt.b, result, tt.expected) + } + }) + } +} + +func Test_interpNum_float64(t *testing.T) { + tests := []struct { + name string + f float64 + a float64 + b float64 + expected float64 + }{ + {"Interpolation with small fractional difference", 0.3, 1.1, 1.2, 1.13}, + {"Interpolation with large fractional difference", 0.7, 0.001, 0.1, 0.0703}, + {"Interpolation with negative fractionals", 0.4, -0.5, -0.1, -0.34}, + {"Extreme values near float64 limits", 0.5, -math.MaxFloat64 / 2, math.MaxFloat64 / 2, 0}, + {"Very small positive numbers", 0.6, 1e-15, 1e-14, 6.4e-15}, + {"Very small negative numbers", 0.8, -1e-14, -1e-15, -2.8e-15}, + {"Numbers close to each other", 0.5, 1.00000001, 1.00000002, 1.000000015}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := interpNum(tt.f, tt.a, tt.b) + if !almostEqual(result, tt.expected, 1e-9) { + t.Errorf("interpNum(%v, %v, %v) = %v, want %v", tt.f, tt.a, tt.b, result, tt.expected) + } + }) + } +} + +func Test_makeBounds_int(t *testing.T) { + tests := []struct { + name string + n int + a int + b int + expected []int + }{ + { + name: "Five equal intervals", + n: 5, + a: 0, + b: 100, + expected: []int{0, 20, 40, 60, 80, 100}, + }, + { + name: "two intervals (3 bounds) with negative num", + n: 2, + a: -10, + b: 10, + expected: []int{-10, 0, 10}, + }, + { + name: "single interval", + n: 1, + a: 5, + b: 10, + expected: []int{5, 10}, + }, + { + name: "too small integer range", + n: 10, + a: 0, + b: 4, + expected: []int{0, 1, 2, 3, 4}, // integer forces fewer bounds + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := makeBoundsNum(tt.n, tt.a, tt.b) + if !reflect.DeepEqual(result, tt.expected) { + t.Errorf("makeBounds(%v, %v, %v, interpNum[int]) = %v, want %v", tt.n, tt.a, tt.b, result, tt.expected) + } + }) + } +} + +func Test_makeBounds_float64(t *testing.T) { + tests := []struct { + name string + n int + a float64 + b float64 + expected []float64 + }{ + { + 
name: "Four intervals with fractional values", + n: 4, + a: 0.5, + b: 2.5, + expected: []float64{0.5, 1.0, 1.5, 2.0, 2.5}, + }, + { + name: "Three intervals with very small numbers", + n: 3, + a: 1e-6, + b: 1e-5, + expected: []float64{1e-6, 4e-6, 7e-6, 1e-5}, + }, + { + name: "four intervals with negative fractional values", + n: 4, + a: -1.5, + b: 1.5, + expected: []float64{-1.5, -0.75, 0, 0.75, 1.5}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := makeBoundsNum(tt.n, tt.a, tt.b) + if !reflect.DeepEqual(result, tt.expected) { + t.Errorf("makeBounds(%v, %v, %v, interpNum[float64]) = %v, want %v", tt.n, tt.a, tt.b, result, tt.expected) + } + }) + } +} + +func Test_interpNum_edge_cases(t *testing.T) { + tests := []struct { + name string + f float64 + a int + b int + expected int + }{ + {"Interpolation with f = 0", 0, 10, 20, 10}, + {"Interpolation with f = 1", 1, 10, 20, 20}, + {"Interpolation with f > 1", 1.5, 10, 20, 25}, + {"Interpolation with f < 0", -0.5, 10, 20, 5}, + {"Interpolation with a = b", 0.5, 15, 15, 15}, + {"Interpolation with very large numbers", 0.5, math.MaxInt32 / 2, math.MaxInt32, 1610612735}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := interpNum(tt.f, tt.a, tt.b) + if result != tt.expected { + t.Errorf("interpNum(%v, %v, %v) = %v, want %v", tt.f, tt.a, tt.b, result, tt.expected) + } + }) + } +} + +func Test_interpBig(t *testing.T) { + tests := []struct { + name string + f float64 + a *big.Int + b *big.Int + expected *big.Int + }{ + { + name: "Simple interpolation", + f: 0.5, + a: big.NewInt(100), + b: big.NewInt(200), + expected: big.NewInt(150), + }, + { + name: "Zero interpolation", + f: 0, + a: big.NewInt(1000), + b: big.NewInt(2000), + expected: big.NewInt(1000), + }, + { + name: "Full interpolation", + f: 1, + a: big.NewInt(500), + b: big.NewInt(1500), + expected: big.NewInt(1500), + }, + { + name: "Interpolation with negative numbers", + f: 0.25, + a: big.NewInt(-1000), + b: big.NewInt(1000), + expected: big.NewInt(-500), + }, + { + name: "Interpolation with large numbers", + f: 0.5, + a: new(big.Int).Exp(big.NewInt(2), big.NewInt(100), nil), + b: new(big.Int).Exp(big.NewInt(2), big.NewInt(101), nil), + expected: new(big.Int).Add( + new(big.Int).Exp(big.NewInt(2), big.NewInt(100), nil), + new(big.Int).Div(new(big.Int).Exp(big.NewInt(2), big.NewInt(100), nil), big.NewInt(2)), + ), + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := interpBig(tt.f, tt.a, tt.b) + if result.Cmp(tt.expected) != 0 { + t.Errorf("interpBig(%v, %v, %v) = %v, want %v", tt.f, tt.a, tt.b, result, tt.expected) + } + }) + } +} + +func Test_interpBig_Panic(t *testing.T) { + defer func() { + if r := recover(); r == nil { + t.Errorf("The code did not panic") + } + }() + + a := big.NewInt(100) + b := big.NewInt(50) + interpBig(0.5, a, b) +} + +func Test_interpStr(t *testing.T) { + tests := []struct { + name string + f float64 + a string + b string + expected string + }{ + { + name: "simple", + f: 0.5, + a: "abc", + b: "xyz", + expected: "mno", + }, + { + name: "zero f", + f: 0, + a: "hello", + b: "world", + expected: "hello", + }, + { + name: "very simple", + f: 0.5, + a: "a", + b: "c", + expected: "b", + }, + { + name: "full interp", + f: 1, + a: "a", + b: "c", + expected: "c", + }, + { + name: "Interpolation with empty strings", + f: 0.5, + a: "", + b: "test", + expected: ":3::", + }, + { + name: "Interpolation with unicode characters", + f: 0.5, + a: "αβγ", + b: "δεζ", + 
expected: "γδε", + }, + // { + // name: "Interpolation with f > 1", + // f: 1.5, + // a: "abc", + // b: "xyz", + // expected: "", + // }, + // { + // name: "Interpolation with f < 0", + // f: -0.5, + // a: "abc", + // b: "xyz", + // expected: "UVW", + // }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := interpString(tt.f, tt.a, tt.b) + if result != tt.expected { + t.Errorf("interpStr(%v, %q, %q) = %q, want %q", tt.f, tt.a, tt.b, result, tt.expected) + } + + // t.Log(result) + + if strings.Compare(tt.a, result) == 1 { + t.Errorf("%v not <= %v", tt.a, result) + } + if strings.Compare(result, tt.b) == 1 { + t.Errorf("%v not >) %v", tt.b, result) + } + }) + } +} + +func Test_interpBts(t *testing.T) { + tests := []struct { + name string + f float64 + a []byte + b []byte + expected []byte + }{ + { + name: "Simple interpolation", + f: 0.5, + a: []byte{0x00}, + b: []byte{0xFF}, + expected: []byte{0x7F}, + }, + { + name: "Zero interpolation", + f: 0, + a: []byte{0x10, 0x20}, + b: []byte{0x30, 0x40}, + expected: []byte{0x10, 0x20}, + }, + { + name: "Full interpolation", + f: 1, + a: []byte{0x00, 0x00}, + b: []byte{0xFF, 0xFF}, + expected: []byte{0xFF, 0xFF}, + }, + { + name: "Interpolation with different byte lengths", + f: 0.25, + a: []byte{0x01}, + b: []byte{0x01, 0x00}, + expected: []byte{0x40}, + }, + { + name: "Interpolation with large numbers", + f: 0.75, + a: []byte{0x00, 0x00, 0x00, 0x00}, + b: []byte{0xFF, 0xFF, 0xFF, 0xFF}, + expected: []byte{0xBF, 0xFF, 0xFF, 0xFF}, + }, + { + name: "Interpolation with empty byte slice", + f: 0.5, + a: []byte{}, + b: []byte{0x2}, + expected: []byte{0x1}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := interpBts(tt.f, tt.a, tt.b) + if !bytes.Equal(result, tt.expected) { + t.Errorf("interpBts(%v, %v, %v) = %v, want %v", tt.f, tt.a, tt.b, result, tt.expected) + } + }) + } +} + +func Test_interpBts_EdgeCases(t *testing.T) { + tests := []struct { + name string + f float64 + a []byte + b []byte + want []byte + }{ + { + name: "Interpolation with f > 1", + f: 1.5, + a: []byte{0x00}, + b: []byte{0xFF}, + want: []byte{1, 126}, + }, + { + name: "Interpolation with f < 0", + f: -0.5, + a: []byte{0x00}, + b: []byte{0xFF}, + want: []byte{127}, + }, + { + name: "Interpolation with very large byte slices", + f: 0.5, + a: make([]byte, 1000), + b: bytes.Repeat([]byte{0xFF}, 1000), + want: append([]byte{127}, bytes.Repeat([]byte{0xFF}, 999)...), + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := interpBts(tt.f, tt.a, tt.b) + if len(result) == 0 { + t.Errorf("interpBts(%v, %v, %v) returned empty slice", tt.f, tt.a, tt.b) + } + if bytes.Compare(result, tt.a) < 0 || bytes.Compare(result, tt.b) > 0 { + t.Errorf("interpBts(%v, %v, %v) = %v, which is out of range", tt.f, tt.a, tt.b, result) + } + // t.Log(result) + if !bytes.Equal(result, tt.want) { + t.Errorf("wanted %x, got %x", tt.want, result) + } + }) + } +} diff --git a/internal/sql/pg/stats.go b/internal/sql/pg/stats.go index d0a2f9e96..b079262c6 100644 --- a/internal/sql/pg/stats.go +++ b/internal/sql/pg/stats.go @@ -7,6 +7,7 @@ import ( "errors" "fmt" "slices" + "sort" "strings" "github.com/jackc/pgx/v5/pgtype" @@ -16,9 +17,12 @@ import ( "github.com/kwilteam/kwil-db/core/types/decimal" ) -// RowCount gets a precise row count for the named fully qualified table. If the -// Executor satisfies the RowCounter interface, that method will be used -// directly. Otherwise a simple select query is used. 
+// statsCap is the limit on the number of MCVs and histogram bins when working
+// with column statistics. Fixed for now, but we should consider making it a
+// stats field or settable another way.
+const statsCap = 100
+
+// RowCount gets a precise row count for the named fully qualified table.
 func RowCount(ctx context.Context, qualifiedTable string, db sql.Executor) (int64, error) {
 	stmt := fmt.Sprintf(`SELECT count(1) FROM %s`, qualifiedTable)
 	res, err := db.Execute(ctx, stmt)
@@ -120,24 +124,75 @@ func colStats(ctx context.Context, qualifiedTable string, colInfo []ColInfo, db
 		pkCols[i] = row[0].(string)
 	}
 
-	numCols := len(colInfo)
-	colTypes := make([]ColType, numCols)
-	for i := range colInfo {
-		colTypes[i] = colInfo[i].Type()
+	stmt := `SELECT * FROM ` + qualifiedTable
+	if len(pkCols) > 0 {
+		stmt += ` ORDER BY ` + strings.Join(pkCols, ",")
+	}
+
+	colStats, err := colStatsInternal(ctx, nil, stmt, colInfo, db)
+	if err != nil {
+		return nil, err
+	}
+
+	// I sunk a bunch of time into a two-pass experiment to get more accurate
+	// MCVs and better histogram bounds, but this is quite costly. May remove.
+
+	return colStats, nil // single ASC pass
+	// second DESC pass:
+	// return colStatsInternal(ctx, colStats, stmt, colInfo, db)
+}
+
+func cmpBool(a, b bool) int {
+	if b {
+		if a { // true == true
+			return 0
+		}
+		return -1 // false < true
+	}
+	if a {
+		return 1 // true > false
+	}
+	return 0 // false == false
+}
+
+func cmpDecimal(val, mm *decimal.Decimal) int {
+	d, err := val.Cmp(mm)
+	if err != nil {
+		panic(fmt.Sprintf("%s: (nan decimal?) %v or %v", err, val, mm))
+	}
+	return d
+}
+
+func colStatsInternal(ctx context.Context, firstPass []sql.ColumnStatistics,
+	stmt string, colInfo []ColInfo, db sql.Executor) ([]sql.ColumnStatistics, error) {
+
+	// The idea I'm testing with a two-pass scan is to collect MCVs from a
+	// reverse-order scan. This is pointless if the MCV set is not at capacity,
+	// as nothing new can be added.
+	if firstPass != nil {
+		stmt += ` DESC` // otherwise ASC is implied with ORDER BY
 	}
 
-	colStats := make([]sql.ColumnStatistics, numCols)
+	getLast := func(i int) *sql.ColumnStatistics {
+		if len(firstPass) != 0 {
+			return &firstPass[i]
+		}
+		return nil
+	}
+
+	// Iterate over all rows (select *), scan into NULLable values like
+	// pgtype.Int8, then make statistics with Go native or Kwil types.
- // iterate over all rows (select *) var scans []any - for _, col := range colInfo { + colTypes := make([]ColType, len(colInfo)) + for i, col := range colInfo { + colTypes[i] = colInfo[i].Type() scans = append(scans, col.scanVal()) } - stmt := `SELECT * FROM ` + qualifiedTable - if len(pkCols) > 0 { - stmt += ` ORDER BY ` + strings.Join(pkCols, ",") - } - err = QueryRowFunc(ctx, db, stmt, scans, + + colStats := make([]sql.ColumnStatistics, len(colInfo)) + + err := QueryRowFunc(ctx, db, stmt, scans, func() error { var err error for i, val := range scans { @@ -172,7 +227,7 @@ func colStats(ctx context.Context, qualifiedTable string, colInfo []ColInfo, db } } - ins(stat, valInt, cmp.Compare[int64]) + ins(stat, getLast(i), valInt, cmp.Compare[int64], interpNum) case ColTypeText: // use string in stats valStr, null, ok := TextValue(val) // val.(string) @@ -184,7 +239,7 @@ func colStats(ctx context.Context, qualifiedTable string, colInfo []ColInfo, db continue } - ins(stat, valStr, strings.Compare) + ins(stat, getLast(i), valStr, strings.Compare, interpString) case ColTypeByteA: // use []byte in stats var valBytea []byte @@ -220,7 +275,7 @@ func colStats(ctx context.Context, qualifiedTable string, colInfo []ColInfo, db return fmt.Errorf("not bytea: %T", val) } - ins(stat, valBytea, bytes.Compare) + ins(stat, getLast(i), valBytea, bytes.Compare, interpBts) case ColTypeBool: // use bool in stats var b bool @@ -246,7 +301,7 @@ func colStats(ctx context.Context, qualifiedTable string, colInfo []ColInfo, db return fmt.Errorf("invalid bool (%T)", val) } - ins(stat, b, cmpBool) + ins(stat, getLast(i), b, cmpBool, interpBool) // there should never be a boolean histogram! case ColTypeNumeric: // use *decimal.Decimal in stats var dec *decimal.Decimal @@ -298,7 +353,7 @@ func colStats(ctx context.Context, qualifiedTable string, colInfo []ColInfo, db dec = &v2 } - ins(stat, dec, cmpDecimal) + ins(stat, getLast(i), dec, cmpDecimal, interpDec) case ColTypeUINT256: v, ok := val.(*types.Uint256) @@ -311,7 +366,7 @@ func colStats(ctx context.Context, qualifiedTable string, colInfo []ColInfo, db continue } - ins(stat, v.Clone(), types.CmpUint256) + ins(stat, getLast(i), v.Clone(), types.CmpUint256, interpUint256) case ColTypeFloat: // we don't want, don't have var varFloat float64 @@ -353,7 +408,7 @@ func colStats(ctx context.Context, qualifiedTable string, colInfo []ColInfo, db return fmt.Errorf("invalid float (%T)", val) } - ins(stat, varFloat, cmp.Compare[float64]) + ins(stat, getLast(i), varFloat, cmp.Compare[float64], interpNum) case ColTypeUUID: fallthrough // TODO @@ -369,35 +424,324 @@ func colStats(ctx context.Context, qualifiedTable string, colInfo []ColInfo, db return nil, err } + // If this is a second pass, merge. See other comments about the two-pass + // approach. It is costly, complex, and hard to quantify benefit. Likely to remove! + for i, p := range firstPass { + if len(colStats[i].MCFreqs) == 0 { // nothing new was recorded for this column + colStats[i].MCFreqs = p.MCFreqs + colStats[i].MCVals = p.MCVals + continue + } + + // merge up the last mcvs. This is horribly inefficient for now: + // concat, build slice of sortable struct, sort by frequency, extract + // up to statsCap, re-sort by value. + freqs := append(colStats[i].MCFreqs, p.MCFreqs...) + vals := append(colStats[i].MCVals, p.MCVals...) 
+
+		type mcv struct {
+			freq int
+			val  any
+		}
+		mcvs := make([]mcv, len(freqs))
+		for i := range freqs {
+			mcvs[i] = mcv{freqs[i], vals[i]}
+		}
+		if len(mcvs) > statsCap { // drop the lowest frequency values
+			slices.SortFunc(mcvs, func(a, b mcv) int {
+				return cmp.Compare(b.freq, a.freq) // descending freq
+			})
+			mcvs = mcvs[:statsCap] // mcvs = slices.Delete(mcvs, statsCap, len(mcvs))
+		}
+
+		valCompFun := compFun(vals[0]) // based on prototype value
+		slices.SortFunc(mcvs, func(a, b mcv) int {
+			return valCompFun(a.val, b.val) // ascending value
+		})
+
+		// extract the values and frequencies slices
+		freqs, vals = nil, nil // clear and preserve type
+		for i := range mcvs {
+			freqs = append(freqs, mcvs[i].freq)
+			vals = append(vals, mcvs[i].val)
+			if i == statsCap {
+				break
+			}
+		}
+
+		/* ALT in-place with sort.Interface
+		if len(mcvs) > statsCap { // drop the lowest frequency values
+			sort.Sort(mcvDescendingFreq{
+				vals:  vals,
+				freqs: freqs,
+			})
+			vals = vals[:statsCap]
+			freqs = freqs[:statsCap]
+		}
+		sort.Sort(mcvAscendingValue{
+			vals:  vals,
+			freqs: freqs,
+			comp:  compFun(vals[0]),
+		}) */
+		colStats[i].MCFreqs = freqs
+		colStats[i].MCVals = vals
+	}
+
 	return colStats, nil
 }
 
-func cmpBool(a, b bool) int {
-	if b {
-		if a { // true == true
-			return 0
+/* xx will remove if we decide a two-pass scan isn't worth it
+type mcvDescendingFreq struct {
+	vals  []any
+	freqs []int
+}
+
+func (m mcvDescendingFreq) Len() int { return len(m.vals) }
+
+func (m mcvDescendingFreq) Less(i int, j int) bool {
+	return m.freqs[i] < m.freqs[j]
+}
+
+func (m mcvDescendingFreq) Swap(i int, j int) {
+	m.vals[i], m.vals[j] = m.vals[j], m.vals[i]
+	m.freqs[i], m.freqs[j] = m.freqs[j], m.freqs[i]
+}
+
+type mcvAscendingValue struct {
+	vals  []any
+	freqs []int
+	comp  func(a, b any) int
+}
+
+func (m mcvAscendingValue) Len() int { return len(m.vals) }
+
+func (m mcvAscendingValue) Less(i int, j int) bool {
+	return m.comp(m.vals[i], m.vals[j]) == -1
+}
+
+func (m mcvAscendingValue) Swap(i int, j int) {
+	m.vals[i], m.vals[j] = m.vals[j], m.vals[i]
+	m.freqs[i], m.freqs[j] = m.freqs[j], m.freqs[i]
+}
+*/
+
+// the pain of []any vs. []T (in an any)
+func wrapCompFun[T any](f func(a, b T) int) func(a, b any) int {
+	return func(a, b any) int { // must not be nil
+		return f(a.(T), b.(T))
+	}
+}
+
+// vs func compFun[T any]() func(a, b T) int
+func compFun(val any) func(a, b any) int {
+	switch val.(type) {
+	case []byte:
+		return wrapCompFun(bytes.Compare)
+	case int64:
+		return wrapCompFun(cmp.Compare[int64])
+	case float64:
+		return wrapCompFun(cmp.Compare[float64])
+	case string:
+		return wrapCompFun(strings.Compare)
+	case bool:
+		return wrapCompFun(cmpBool)
+	case *decimal.Decimal:
+		return wrapCompFun(cmpDecimal)
+	case *types.Uint256:
+		return wrapCompFun(types.CmpUint256)
+	case types.UUID:
+		return wrapCompFun(types.CmpUUID)
+
+	case decimal.DecimalArray: // TODO
+	case types.Uint256Array: // TODO
+	case []string:
+	case []int64:
+	}
+
+	panic(fmt.Sprintf("no comp fun for type %T", val))
+}
+
+// The following functions perform a type switch to correctly handle null values
+// and then dispatch to the generic ins/up functions with the appropriate
+// comparison function for the underlying type: upColStatsWithInsert,
+// upColStatsWithDelete, and upColStatsWithUpdate.
+
+// upColStatsWithInsert expects a value of the type created with a
+// (*datatype).DeserializeChangeset method.
+func upColStatsWithInsert(stats *sql.ColumnStatistics, val any) error {
+	if val == nil {
+		stats.NullCount++
+		return nil
+	}
+	// INSERT
+	switch nt := val.(type) {
+	case []byte:
+		return ins(stats, nil, nt, bytes.Compare, interpBts)
+	case int64:
+		return ins(stats, nil, nt, cmp.Compare[int64], interpNum[int64])
+	case float64:
+		return ins(stats, nil, nt, cmp.Compare[float64], interpNum[float64])
+	case string:
+		return ins(stats, nil, nt, strings.Compare, interpString)
+	case bool:
+		return ins(stats, nil, nt, cmpBool, interpBool)
+
+	case *decimal.Decimal:
+		if nt.NaN() {
+			stats.NullCount++
+			return nil // ignore, don't put in stats
 		}
-		return -1 // false < true
+		nt2 := *nt
+		return ins(stats, nil, &nt2, cmpDecimal, interpDec)
+
+	case *types.Uint256:
+		if nt.Null {
+			stats.NullCount++
+			return nil
+		}
+
+		return ins(stats, nil, nt.Clone(), types.CmpUint256, interpUint256)
+
+	case types.UUID: // TODO
+
+		return ins(stats, nil, nt, types.CmpUUID, interpUUID)
+
+	case decimal.DecimalArray: // TODO
+	case types.Uint256Array: // TODO
+	case []string:
+	case []int64:
+
+	default:
+		return fmt.Errorf("unrecognized tuple column type %T", val)
 	}
-	if a {
-		return 1 // true > false
+
+	fmt.Printf("unhandled %T\n", val)
+
+	return nil // known type, just no stats handling
+}
+
+func upColStatsWithDelete(stats *sql.ColumnStatistics, old any) error {
+	// DELETE:
+	// - unset min max if removing it. reference mincount and maxcount to know
+	// - update null count
+	// - adjust mcvs / histogram
+
+	if old == nil {
+		stats.NullCount--
+		return nil
 	}
-	return 0 // false == false
+
+	switch nt := old.(type) {
+	case int64:
+		del(stats, nt, cmp.Compare[int64])
+	case string:
+		del(stats, nt, strings.Compare)
+	case float64:
+		del(stats, nt, cmp.Compare[float64])
+	case bool:
+		del(stats, nt, cmpBool)
+	case []byte:
+		del(stats, nt, bytes.Compare)
+
+	case *decimal.Decimal:
+		if nt.NaN() {
+			stats.NullCount--
+			return nil // ignore, don't put in stats
+		}
+		nt2 := *nt
+
+		del(stats, &nt2, cmpDecimal)
+
+	case *types.Uint256:
+		if nt.Null {
+			stats.NullCount--
+			return nil
+		}
+
+		return del(stats, nt.Clone(), types.CmpUint256)
+
+	case types.UUID:
+
+		return del(stats, nt, types.CmpUUID)
+
+	case decimal.DecimalArray: // TODO
+	case types.Uint256Array: // TODO
+	case []string:
+	case []int64:
+
+	default:
+		return fmt.Errorf("unrecognized tuple column type %T", old)
+	}
+
+	return nil
 }
 
-func cmpDecimal(val, mm *decimal.Decimal) int {
-	d, err := val.Cmp(mm)
+func upColStatsWithUpdate(stats *sql.ColumnStatistics, old, up any) error { //nolint:unused
+	// update may or may not affect null count
+	err := upColStatsWithDelete(stats, old)
 	if err != nil {
-		panic(fmt.Sprintf("%s: (nan decimal?) %v or %v", err, val, mm))
+		return err
 	}
-	return d
+	return upColStatsWithInsert(stats, up)
+}
+
+// The following are the generic functions that handle the re-typed and non-NULL
+// values from the upColStatsWith* functions above.
+
+// insMCVs attempts to insert a new value into the MCV set. If the set already
+// includes the value, its frequency is incremented. If the value is new and the
+// set is not yet at capacity, it is inserted at the appropriate location in the
+// slices to keep them sorted by value. The sorting is needed later when
+// computing cumulative frequency with inequality conditions, and it allows
+// locating known values in log time.
+func insMCVs[T any](vals []any, freqs []int, val T, comp func(a, b T) int) ([]any, bool, []int) {
+	var spill bool
+	// sort.Search is much harder to use than slices.BinarySearchFunc but here we are
+	loc := sort.Search(len(vals), func(i int) bool {
+		v := vals[i].(T)
+		return comp(v, val) != -1 // v[i] >= val
+	})
+	found := loc != len(vals) && comp(vals[loc].(T), val) == 0
+	if found {
+		if comp(vals[loc].(T), val) != 0 {
+			panic("wrong loc")
+		}
+		freqs[loc]++
+	} else if len(vals) < statsCap {
+		vals = slices.Insert(vals, loc, any(val))
+		freqs = slices.Insert(freqs, loc, 1)
+	} else {
+		spill = true
+	}
+	return vals, spill, freqs
 }
 
-func ins[T any](stats *sql.ColumnStatistics, val T, comp func(v, m T) int) error {
-	if stats.Min == nil {
+// this is SOOO much easier with vals as a []T rather than []any.
+// alas, that created pains in other places... might switch back.
+
+/*func insMCVs[T any](vals []T, freqs []int, val T, comp func(a, b T) int) ([]T, []int) {
+	loc, found := slices.BinarySearchFunc(vals, val, comp)
+	if found {
+		freqs[loc]++
+	} else if len(vals) < statsCap {
+		vals = slices.Insert(vals, loc, val)
+		freqs = slices.Insert(freqs, loc, 1)
+	}
+	return vals, freqs
+}*/
+
+// ins is used to insert a non-NULL value, but it can be used in different contexts:
+// (1) performing a full scan, where prev may be non-nil on a second pass,
+// (2) maintaining stats estimate on an insert. del and update only apply in the
+// latter context.
+func ins[T any](stats, prev *sql.ColumnStatistics, val T, comp func(v, m T) int,
+	interp func(f float64, a, b T) T) error {
+
+	switch mn := stats.Min.(type) {
+	case nil: // first observation
 		stats.Min = val
 		stats.MinCount = 1
-	} else if mn, ok := stats.Min.(T); ok {
+	case T:
 		switch comp(val, mn) {
 		case -1: // new MINimum
 			stats.Min = val
@@ -405,24 +749,143 @@ func ins[T any](stats *sql.ColumnStatistics, val T, comp func(v, m T) int) error
 		case 0: // another of the same
 			stats.MinCount++
 		}
-	} else {
+	case unknown: // it was deleted, only full (re)scan can figure it out
+	default:
 		return fmt.Errorf("invalid stats value type %T for tuple of type %T", val, stats.Min)
 	}
 
-	if stats.Max == nil {
+	switch mn := stats.Max.(type) {
+	case nil: // first observation (also would have set Min above)
 		stats.Max = val
 		stats.MaxCount = 1
-	} else if mx, ok := stats.Max.(T); ok {
-		switch comp(val, mx) {
+	case T:
+		switch comp(val, mn) {
 		case 1: // new MAXimum
 			stats.Max = val
 			stats.MaxCount = 1
 		case 0: // another of the same
 			stats.MaxCount++
 		}
-	} else {
+	case unknown: // it was deleted, only full (re)scan can figure it out
+	default:
+		return fmt.Errorf("invalid stats value type %T for tuple of type %T", val, stats.Min)
+	}
+
+	if stats.MCVals == nil {
+		stats.MCVals = []any{} // insMCVs: = []T{val} ; stats.MCFreqs = []int{1}
+	}
+
+	var missed bool
+	if prev == nil || len(prev.MCVals) < statsCap { // first pass of full scan, pointless second pass, or no-context insert
+		stats.MCVals, missed, stats.MCFreqs = insMCVs(stats.MCVals, stats.MCFreqs, val, comp)
+		// vals, freqs, _ := insMCVs(convSlice[T](stats.MCVals), stats.MCFreqs, val, comp)
+		// stats.MCVals, stats.MCFreqs = convSlice2(vals), freqs
+	} else { // a second pass in a full scan
+		// The freqs for MCVals are complete, representing the entire table.
+		// Fill the spills struct instead, but EXCLUDING vals in MCVals,
+		// allowing possibly higher counts in later rows. Caller should merge
+		// back to the MCVals/MCFreqs after the complete second pass.
+ + // When MCVals was an any (underlying []T) rather than a []any, we were + // able to simply assert to []T, but I switched to []any for other reasons. + // _, found := slices.BinarySearchFunc(stats.MCVals.([]T), val, comp) + + loc := sort.Search(len(prev.MCVals), func(i int) bool { + v := prev.MCVals[i].(T) + return comp(v, val) != -1 // v[i] >= val + }) + found := loc != len(prev.MCVals) && comp(prev.MCVals[loc].(T), val) == 0 + // _, found := slices.BinarySearchFunc(convSlice[T](prev.MCVals), val, comp) + if !found { // not in previous scan + stats.MCVals, missed, stats.MCFreqs = insMCVs(stats.MCVals, stats.MCFreqs, val, comp) + } // else ignore this value, already counted in prev pass + } + + // If the value was not included in the MCVs, it spills into the histogram. + if missed { + // we create the histogram only *after* MCVs have been collected so that + // the bounds can be chosen based on at least some observed values. + if stats.Histogram == nil { + var left, right T + if mn, ok := stats.Min.(T); ok { + left = mn + } else { + left = stats.MCVals[0].(T) + } + if mn, ok := stats.Max.(T); ok { + right = mn + } else { + right = stats.MCVals[len(stats.MCVals)-1].(T) + } + bounds := makeBounds(statsCap, left, right, comp, interp) + stats.Histogram = makeHisto(bounds, comp) + } + + h := stats.Histogram.(histo[T]) + h.ins(val) + } + + return nil +} + +// If we've eliminated all of a value via delete, then the real Min/Max is +// unknown, and it just cannot be used to affect selectivity. Further inserts +// also cannot be compared with an unknown min/max. Rescan is needed to identify +// the actual value. This type signals this case. +type unknown struct{} + +// del is used to delete a non-NULL value. +func del[T any](stats *sql.ColumnStatistics, val T, comp func(v, m T) int) error { + + switch mn := stats.Min.(type) { + case nil: // should not happen + return errors.New("nil Min on del") + case T: + if comp(val, mn) == 0 { + stats.MinCount-- + if stats.MinCount == 0 { + stats.Min = unknown{} + } + } + case unknown: // it was deleted, only full (re)scan can figure it out + default: + return fmt.Errorf("invalid stats value type %T for tuple of type %T", val, stats.Min) + } + + switch mn := stats.Max.(type) { + case nil: // should not happen + return errors.New("nil Max on del") + case T: + if comp(val, mn) == 0 { + stats.MaxCount-- + if stats.MaxCount == 0 { + stats.Max = unknown{} + } + } + case unknown: // it was deleted, only full (re)scan can figure it out + default: return fmt.Errorf("invalid stats value type %T for tuple of type %T", val, stats.Max) } + // Look for it in the MCVs + loc := sort.Search(len(stats.MCVals), func(i int) bool { + v := stats.MCVals[i].(T) + return comp(v, val) != -1 // v[i] >= val + }) + found := loc != len(stats.MCVals) && comp(stats.MCVals[loc].(T), val) == 0 + // loc, found := slices.BinarySearchFunc(convSlice[T](stats.MCVals), val, comp) + if found { + if stats.MCFreqs[loc] == 1 { + stats.MCVals = slices.Delete(stats.MCVals, loc, loc+1) + stats.MCFreqs = slices.Delete(stats.MCFreqs, loc, loc+1) + } else { + stats.MCFreqs[loc]-- + } + } else { + // adjust histogram freq-- + hist := stats.Histogram.(histo[T]) + hist.rm(val) + } + return nil } diff --git a/internal/sql/pg/stats_enc.go b/internal/sql/pg/stats_enc.go new file mode 100644 index 000000000..8b7757365 --- /dev/null +++ b/internal/sql/pg/stats_enc.go @@ -0,0 +1,79 @@ +package pg + +import ( + "encoding/gob" + "errors" + "io" + "reflect" + + "github.com/kwilteam/kwil-db/common/sql" +) + +// 
EncStats encodes a set of table statistics to an io.Writer. +func EncStats(w io.Writer, statsGroup map[sql.TableRef]*sql.Statistics) error { + enc := gob.NewEncoder(w) + // TableRef1,Statistics1,TableRef2,Statistics2,... + for tblRef, stats := range statsGroup { + err := enc.Encode(tblRef) + if err != nil { + return err + } + err = enc.Encode(stats) + if err != nil { + return err + } + } + return nil +} + +// DecStats decodes a set of table statistics from an io.Reader. Any type used +// must be registered with the gob package. +func DecStats(r io.Reader) (map[sql.TableRef]*sql.Statistics, error) { + dec := gob.NewDecoder(r) + + // Read until EOF + out := map[sql.TableRef]*sql.Statistics{} + for { + var tblRef sql.TableRef + if err := dec.Decode(&tblRef); err != nil { + if errors.Is(err, io.EOF) { + break // return out, nil + } + return nil, err + } + + stats := new(sql.Statistics) + if err := dec.Decode(&stats); err != nil { + return nil, err + } + + if stats.RowCount > 0 { + // gob will leave empty slice as nil, which is wrong for most fields + for i := range stats.ColumnStatistics { + cs := &stats.ColumnStatistics[i] + cs.Min = nilToEmptySlice(cs.Min) + cs.Max = nilToEmptySlice(cs.Max) + for i := range cs.MCVals { + cs.MCVals[i] = nilToEmptySlice(cs.MCVals[i]) + } + } + } + + out[tblRef] = stats + } + + return out, nil +} + +func nilToEmptySlice(s any) any { + rt := reflect.TypeOf(s) + if rt == nil { + return nil + } + if rt.Kind() == reflect.Slice && + reflect.ValueOf(s).IsNil() { + st := reflect.SliceOf(rt.Elem()) + return reflect.MakeSlice(st, 0, 0).Interface() + } + return s +} diff --git a/internal/sql/pg/stats_enc_test.go b/internal/sql/pg/stats_enc_test.go new file mode 100644 index 000000000..7f525eb00 --- /dev/null +++ b/internal/sql/pg/stats_enc_test.go @@ -0,0 +1,143 @@ +package pg + +import ( + "bytes" + "reflect" + "testing" + + "github.com/kwilteam/kwil-db/common/sql" +) + +func TestStatsEncoding(t *testing.T) { + var buf bytes.Buffer + + // tables slice so the test loop is ordered + var tblRefs []sql.TableRef + tblRefs = append(tblRefs, sql.TableRef{ // native Go type, int64 + Namespace: "asdf", + Table: "int64", + }) + tblRefs = append(tblRefs, sql.TableRef{ // Kwil type Decimal + Namespace: "asdf", + Table: "decimal", + }) + tblRefs = append(tblRefs, sql.TableRef{ // Kwil type Uint256 + Namespace: "asdf", + Table: "uint256", + }) + tblRefs = append(tblRefs, sql.TableRef{ // Kwil type UUID + Namespace: "nnnn", + Table: "uuid", + }) + tblRefs = append(tblRefs, sql.TableRef{ // byte slice + Namespace: "asdf", + Table: "[]byte", + }) + + in := map[sql.TableRef]*sql.Statistics{ + tblRefs[0]: { + RowCount: 1234, + ColumnStatistics: []sql.ColumnStatistics{ + { + NullCount: 42, + Min: int64(1), + MinCount: 2, + Max: int64(6), + MaxCount: 1, + MCVals: []any{int64(1), int64(1), int64(4), int64(5), int64(6)}, + MCFreqs: []int{2, 9, 3, 1, 1}, + }, + }, + }, + tblRefs[1]: { + RowCount: 1234, + ColumnStatistics: []sql.ColumnStatistics{ + { + NullCount: 42, + Min: mustDecimal("1"), + MinCount: 2, + Max: mustDecimal("6"), + MaxCount: 1, + MCVals: []any{mustDecimal("1"), mustDecimal("1"), mustDecimal("4"), mustDecimal("5"), mustDecimal("6")}, + MCFreqs: []int{2, 9, 3, 1, 1}, + }, + }, + }, + tblRefs[2]: { + RowCount: 1234, + ColumnStatistics: []sql.ColumnStatistics{ + { + NullCount: 42, + Min: mustUint256("1"), + MinCount: 2, + Max: mustUint256("6"), + MaxCount: 1, + MCVals: []any{mustUint256("1"), mustUint256("1"), mustUint256("4"), mustUint256("5"), mustUint256("6")}, + MCFreqs: 
[]int{2, 9, 3, 1, 1},
+				},
+			},
+		},
+		tblRefs[3]: { // uuid
+			RowCount: 1645,
+			ColumnStatistics: []sql.ColumnStatistics{
+				{
+					NullCount: 8,
+					Min:       mustParseUUID("0000857c-8671-4f4e-99bd-fcc621f9d3d1"),
+					MinCount:  6,
+					Max:       mustParseUUID("9000857c-8671-4f4e-99bd-fcc621f9d3d1"),
+					MaxCount:  789,
+					MCVals: []any{mustParseUUID("0000857c-8671-4f4e-99bd-fcc621f9d3d1"),
+						mustParseUUID("1000857c-8671-4f4e-99bd-fcc621f9d3d1"),
+						mustParseUUID("2000857c-8671-4f4e-99bd-fcc621f9d3d1"),
+						mustParseUUID("3000857c-8671-4f4e-99bd-fcc621f9d3d1"),
+						mustParseUUID("9000857c-8671-4f4e-99bd-fcc621f9d3d1"),
+					},
+					MCFreqs: []int{6, 9, 3, 1, 789},
+				},
+			},
+		},
+		tblRefs[4]: { // []byte
+			RowCount: 88,
+			ColumnStatistics: []sql.ColumnStatistics{
+				{
+					NullCount: 42,
+					Min:       []byte{}, // important distinction with non-nil empty slice
+					MinCount:  2,
+					Max:       []byte{0xff, 0xff, 0xff},
+					MaxCount:  1,
+					MCVals:    []any{[]byte{0}, []byte{1}, []byte{2}, []byte{0xff, 0xff, 0xff}},
+					MCFreqs:   []int{2, 9, 3, 1},
+				},
+			},
+		},
+	}
+
+	err := EncStats(&buf, in)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	bts := buf.Bytes() // t.Logf("encoding length = %d", len(bts))
+
+	rd := bytes.NewReader(bts)
+	out, err := DecStats(rd)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if len(out) != len(in) {
+		t.Fatal("maps length not the same")
+	}
+
+	for tblRef, stat := range in {
+		outStat, have := out[tblRef]
+		if !have {
+			t.Fatalf("output stats lack table %v", tblRef)
+		}
+		// require.Equal(t, stat, outStat)
+		if !reflect.DeepEqual(stat, outStat) {
+			t.Fatalf("%v: output stats (%v) != input stats (%v)",
+				tblRef, outStat, stat)
+		}
+	}
+}
diff --git a/internal/sql/pg/stats_test.go b/internal/sql/pg/stats_test.go
index 7dd64f693..da1298464 100644
--- a/internal/sql/pg/stats_test.go
+++ b/internal/sql/pg/stats_test.go
@@ -5,38 +5,87 @@ package pg
 import (
 	"context"
 	"fmt"
+	"slices"
+	"strconv"
 	"testing"
 
+	"github.com/kwilteam/kwil-db/common/sql"
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
 )
 
-func TestTableStats(t *testing.T) {
+func mkTestTableDB(t *testing.T) *DB {
 	ctx := context.Background()
-
 	db, err := NewDB(ctx, cfg)
 	if err != nil {
 		t.Fatal(err)
 	}
-	defer db.Close()
+	t.Cleanup(func() {
+		db.Close()
+	})
+	return db
+}
 
-	tx, err := db.BeginTx(ctx)
+func mkStatsTestTableTx(t *testing.T, db *DB) sql.PreparedTx {
+	ctx := context.Background()
+	tx, err := db.BeginPreparedTx(ctx)
+	// tx, err := db.BeginTx(ctx)
 	if err != nil {
 		t.Fatal(err)
 	}
-	defer tx.Rollback(ctx)
 
 	tbl := "colcheck"
+	t.Cleanup(func() {
+		tx.Rollback(ctx)
+		if t.Failed() {
+			db.AutoCommit(true)
+			db.Execute(ctx, `drop table if exists `+tbl)
+		}
+	})
+
 	_, err = tx.Execute(ctx, `drop table if exists `+tbl)
 	if err != nil {
 		t.Fatal(err)
 	}
 	_, err = tx.Execute(ctx, `create table if not exists `+tbl+
-		` (a int8 primary key, b int4 default 42, c text, d bytea, e numeric(20,5), f int8[], g uint256, h uint256[])`)
+		` (a int8 primary key, b int8 default 42, c text, d bytea, e numeric(20,5), f int4[], g uint256, h uint256[])`)
 	if err != nil {
 		t.Fatal(err)
 	}
+	return tx
+}
+
+func TestStatsUpdates(t *testing.T) {
+	ctx := context.Background()
+	db := mkTestTableDB(t)
+	txOuter := mkStatsTestTableTx(t, db)
+
+	txOuter.Execute(ctx, "--ping")
+
+	tx, err := txOuter.BeginTx(ctx)
+	require.NoError(t, err)
+	t.Cleanup(func() {
+		tx.Rollback(ctx)
+	})
+
+	// insert some stuff
+	tbl := `colcheck`
+	_, err = tx.Execute(ctx, `INSERT INTO `+tbl+` VALUES(0, 123, 'asdf')`)
+	require.NoError(t, err)
+
+	err = tx.Commit(ctx)
+	require.NoError(t, err)
+
+
+func TestTableStats(t *testing.T) {
+	ctx := context.Background()
+	db := mkTestTableDB(t)
+	tx := mkStatsTestTableTx(t, db)
+
+	tbl := "colcheck"
+
 	cols, err := ColumnInfo(ctx, tx, "", tbl)
 	if err != nil {
 		t.Fatal(err)
@@ -44,11 +93,11 @@ func TestTableStats(t *testing.T) {
 
 	wantCols := []ColInfo{
 		{Pos: 1, Name: "a", DataType: "bigint", Nullable: false},
-		{Pos: 2, Name: "b", DataType: "integer", Nullable: true, defaultVal: "42"},
+		{Pos: 2, Name: "b", DataType: "bigint", Nullable: true, defaultVal: "42"},
 		{Pos: 3, Name: "c", DataType: "text", Nullable: true},
 		{Pos: 4, Name: "d", DataType: "bytea", Nullable: true},
 		{Pos: 5, Name: "e", DataType: "numeric", Nullable: true},
-		{Pos: 6, Name: "f", DataType: "bigint", Array: true, Nullable: true},
+		{Pos: 6, Name: "f", DataType: "integer", Array: true, Nullable: true},
 		{Pos: 7, Name: "g", DataType: "uint256", Nullable: true},
 		{Pos: 8, Name: "h", DataType: "uint256", Array: true, Nullable: true},
 	}
@@ -75,8 +124,76 @@ func TestTableStats(t *testing.T) {
 	fmt.Println(stats.ColumnStatistics[4].Max)
 }
 
+// Test_scanSineBig is similar to Test_updates_demo, but actually uses a DB,
+// inserting data into a table and testing the TableStats function.
+func Test_scanSineBig(t *testing.T) {
+	// Build the full set of values:
+	// sine wave with 100 samples per period, 400 periods
+	const numUpdates = 40000
+	const samplesPerPeriod = 100
+	const ampl = 200.0   // larger => more distinct integer values after rounding
+	const amplSteps = 10 // "noise" with small ampl variations between periods
+	const amplInc = 2.0  // each step adds a multiple of this to the amplitude
+	vals := makeTestVals(numUpdates, samplesPerPeriod, amplSteps, ampl, amplInc)
+
+	ctx := context.Background()
+
+	db := mkTestTableDB(t)
+	tx := mkStatsTestTableTx(t, db)
+	tbl := `colcheck`
+
+	for i, val := range vals {
+		_, err := tx.Execute(ctx, `INSERT INTO `+tbl+` VALUES($1,$2,$3);`,
+			i, val, strconv.FormatInt(val, 10))
+		require.NoError(t, err)
+	}
+
+	stats, err := TableStats(ctx, "", tbl, tx)
+	require.NoError(t, err)
+
+	require.True(t, stats.RowCount == numUpdates)
+
+	// check the MCVs for the int8 column
+	col := stats.ColumnStatistics[1]
+
+	require.Equal(t, len(col.MCFreqs), statsCap)
+	require.Equal(t, len(col.MCVals), statsCap)
+
+	_, ok := col.MCVals[0].(int64)
+	require.True(t, ok, "wrong value type")
+
+	valsT := convSliceAsserted[int64](col.MCVals)
+	require.True(t, slices.IsSorted(valsT))
+
+	t.Log(valsT)
+	t.Log(col.MCFreqs)
+
+	var totalFreqMCVs int
+	for _, f := range col.MCFreqs {
+		totalFreqMCVs += f
+	}
+	fracMCVs := float64(totalFreqMCVs) / numUpdates
+	t.Log(fracMCVs)
+
+	require.Greater(t, totalFreqMCVs, statsCap) // not just all ones
+	require.LessOrEqual(t, totalFreqMCVs, numUpdates)
+
+	hist := col.Histogram.(histo[int64])
+	t.Log(hist)
+	var totalFreqHist int
+	for _, f := range hist.freqs {
+		totalFreqHist += f
+	}
+	fracHists := float64(totalFreqHist) / numUpdates
+	t.Log(fracHists)
+
+	t.Log(fracMCVs + fracHists)
+
+	t.Log(col.Min.(int64), col.Max.(int64))
+}
+
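The MCV list and histogram validated above are exactly what a cost model would consume. As a hedged illustration only (a hypothetical helper, not part of this patch), an equality-predicate selectivity estimate can come straight from the sorted MCVals and the index-aligned MCFreqs; the int64 element type matches column b above, and slices is already imported in this file:

// eqSelectivity estimates the fraction of rows matching "col = v" from the
// MCV list. MCVals are kept sorted ascending with MCFreqs index-aligned, so
// a binary search locates the matching frequency. ok is false when v is not
// a most-common value, in which case a caller would fall back to the
// histogram buckets or a uniformity assumption.
func eqSelectivity(vals []int64, freqs []int, rowCount int64, v int64) (sel float64, ok bool) {
	loc, found := slices.BinarySearch(vals, v)
	if !found {
		return 0, false
	}
	return float64(freqs[loc]) / float64(rowCount), true
}

With the data above, eqSelectivity(valsT, col.MCFreqs, stats.RowCount, valsT[0]) would return the first MCV's frequency as a fraction of the 40000 rows.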
 /*func TestScanBig(t *testing.T) {
-// This test is commented, but helpful for benchmarking performance with a large table.
+	// This test is commented, but helpful for benchmarking performance with a large table.
 	ctx := context.Background()
 
 	cfg := *cfg
@@ -97,13 +214,13 @@ func TestTableStats(t *testing.T) {
 	defer tx.Rollback(ctx)
 
 	tbl := `giant`
-	cols, err := ColumnInfo(ctx, tx, tbl)
+	cols, err := ColumnInfo(ctx, tx, "", tbl)
 	if err != nil {
 		t.Fatal(err)
 	}
 	t.Logf("%#v", cols)
 
-	stats, err := TableStats(ctx, tbl, tx)
+	stats, err := TableStats(ctx, "", tbl, tx)
 	if err != nil {
 		t.Fatal(err)
 	}
diff --git a/internal/sql/pg/stats_update_test.go b/internal/sql/pg/stats_update_test.go
new file mode 100644
index 000000000..1e5d42fd5
--- /dev/null
+++ b/internal/sql/pg/stats_update_test.go
@@ -0,0 +1,121 @@
+package pg
+
+import (
+	"math"
+	"slices"
+	"testing"
+
+	"github.com/kwilteam/kwil-db/common/sql"
+	"github.com/stretchr/testify/require"
+)
+
+func makeTestVals(num, samplesPerPeriod, amplSteps int, ampl, amplInc float64) []int64 {
+	vals := make([]int64, num)
+	for i := 0; i < num; i++ {
+		p, f := math.Modf(float64(i) / float64(samplesPerPeriod))
+		f *= 2 * math.Pi
+		pMod := math.Mod(p, float64(amplSteps))
+		p = amplInc * pMod // small periodic variation in amplitude: amplInc * (0..amplSteps-1)
+		vals[i] = int64(math.Round((ampl + p) * math.Sin(f)))
+	}
+	return vals
+}
+
+// Test_updates_demo tests manual updates to a ColumnStatistics with the generic
+// up* functions, which is how statistics are kept updated in the logical
+// replication stream. Test_scanSineBig tests a full scan with TableStats.
+func Test_updates_demo(t *testing.T) {
+	stats := &sql.ColumnStatistics{}
+
+	// Make data to ingest that:
+	// - uses the capacity of the MCVs
+	// - has many repeats
+	// - >90% (but not all) of values in MCVs
+	// sine wave with 100 samples per period, 100 periods
+	const numUpdates = 10000
+	const samplesPerPeriod = 100
+	const ampl = 600.0  // larger => more distinct integer values after rounding
+	const amplSteps = 3 // "noise" with small ampl variations between periods
+	const amplInc = 2.2 // each step adds a multiple of this to the amplitude
+
+	// Build the full set of values
+	vals := makeTestVals(numUpdates, samplesPerPeriod, amplSteps, ampl, amplInc)
+
+	// ensure the test data exceeds the MCV cap
+	fullCounts := make(map[int64]int)
+	for _, v := range vals {
+		fullCounts[v]++
+	}
+	require.Greater(t, len(fullCounts), statsCap)
+
+	// insert one at a time
+	for _, v := range vals {
+		require.NoError(t, upColStatsWithInsert(stats, v))
+	}
+
+	maxVal := slices.Max(vals)
+
+	// min/max must be captured even if not in the MCVs
+	require.Equal(t, -maxVal, stats.Min)
+	require.Equal(t, maxVal, stats.Max)
+
+	hist := stats.Histogram.(histo[int64]) // t.Log("histogram:", hist)
+	histCount0 := hist.TotalCount()
+
+	// insert an outlier
+	require.NoError(t, upColStatsWithInsert(stats, int64(math.MaxInt64)))
+	require.Equal(t, int64(math.MaxInt64), stats.Max) // 9223372036854775807
+	require.Equal(t, histCount0+1, hist.TotalCount()) // one more in the histogram
+
+	// The MCVals slice should be sorted (ascending).
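+	// MCVals and MCFreqs are index-aligned, which is what makes the
+	// slices.BinarySearch lookups below valid: the position of a value in
+	// the sorted MCVals is also the position of its frequency in MCFreqs.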
+ mcVals := convSliceAsserted[int64](stats.MCVals) + require.Equal(t, len(mcVals), statsCap) + require.True(t, slices.IsSorted(mcVals)) + // outlier must not be in MCVs, which is full + require.False(t, slices.Contains(mcVals, int64(math.MaxInt64))) + + // remove the outlier + require.NoError(t, upColStatsWithDelete(stats, int64(math.MaxInt64))) + require.Equal(t, unknown{}, stats.Max) + require.Equal(t, histCount0, hist.TotalCount()) // back to the original counts + + ltVal := int64(0) // WHERE v < 0, about half the values for a sine wave + loc, found := slices.BinarySearch(mcVals, ltVal) + + require.True(t, found) + require.Equal(t, ltVal, mcVals[loc]) + + // for "WHERE v <= 0", use loc++ (if found) + loc++ + + sumMCFreqs := func() int { + var mcvSum, negativeFreqTotal int + freqs := make([]float64, len(stats.MCFreqs)) + for i, f := range stats.MCFreqs { + freqs[i] = float64(f) / float64(numUpdates) + mcvSum += f + if i < loc { + negativeFreqTotal += f + } + } + + // t.Log("total freq of all MCVs:", float64(mcvSum)/float64(numUpdates)) + // t.Log("sum of freqs where v<=0:", float64(negativeFreqTotal)/float64(numUpdates)) + return mcvSum + } + + mcvSum := sumMCFreqs() + totalCounted := mcvSum + hist.TotalCount() + + // every value must be counted in either the MCVs array or the histogram + require.Equal(t, numUpdates, totalCounted) + + // remove vals one at a time + for _, v := range vals { + require.NoError(t, upColStatsWithDelete(stats, v)) + } + + mcvSum = sumMCFreqs() + require.Equal(t, 0, mcvSum) + require.Equal(t, 0, hist.TotalCount()) +} diff --git a/internal/sql/pg/system.go b/internal/sql/pg/system.go index 91c2f4641..81c90b437 100644 --- a/internal/sql/pg/system.go +++ b/internal/sql/pg/system.go @@ -6,11 +6,14 @@ package pg import ( "cmp" "context" + "encoding/gob" "errors" "fmt" + "reflect" "slices" "strconv" "strings" + "time" "github.com/jackc/pgx/v5" "github.com/jackc/pgx/v5/pgtype" @@ -244,6 +247,111 @@ type ColInfo struct { defaultVal any } +func convSliceAsserted[T any](s []any) []T { + out := make([]T, len(s)) + for i := range s { + out[i] = s[i].(T) + } + return out +} + +func convSliceInterfaces[T any](s []T) []any { //nolint + out := make([]any, len(s)) + for i := range s { + out[i] = s[i] + } + return out +} + +// joinSlices is for the case when I use `vals any // []T` rather than `vals []any` +func joinSlices(s1, s2 any) any { //nolint + rt := reflect.TypeOf(s1) + if rt == nil || rt.Kind() != reflect.Slice { + panic("not a slice") + } + if rt.Elem() != reflect.TypeOf(s2).Elem() { + panic("different element types") + } + + rv1 := reflect.ValueOf(s1) + rv2 := reflect.ValueOf(s2) + l1, l2 := rv1.Len(), rv2.Len() + tot := l1 + l2 + + s := mkSlice(rt.Elem(), tot) + for i := 0; i < l1; i++ { + s.Index(i).Set(rv1.Index(i)) + } + for i := 0; i < l2; i++ { + s.Index(i + l1).Set(rv2.Index(i)) + } + return s.Interface() +} + +func mkSlice(rt reflect.Type, l int) reflect.Value { + st := reflect.SliceOf(rt) + return reflect.MakeSlice(st, l, l) +} + +// typeFor returns the reflect.Type that represents the type argument T. TODO: +// Remove this in favor of reflect.TypeFor when Go 1.22 becomes the minimum +// required version since it is not available in Go 1.21. 
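+//
+// The (*T)(nil) expression yields a typed nil *T whose Elem is T, so no value
+// of T is ever constructed. This works even for interface types, where
+// reflect.TypeOf on a value would report the dynamic type instead.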
+func typeFor[T any]() reflect.Type { //nolint + return reflect.TypeOf((*T)(nil)).Elem() +} + +func statsVal(ct ColType) any { + // return reflect.New(statsValType(ct)).Interface() + switch ct { + case ColTypeInt: + return int64(0) + case ColTypeText: + return string("") + case ColTypeBool: + return bool(false) + case ColTypeByteA: + return []byte{} + case ColTypeUUID: + return new(types.UUID) + case ColTypeNumeric: + return new(decimal.Decimal) + case ColTypeUINT256: + return new(types.Uint256) + case ColTypeFloat: + return float64(0) + case ColTypeTime: + return time.Time{} + default: + return nil + } +} + +func statsValType(ct ColType) reflect.Type { + return reflect.TypeOf(statsVal(ct)) + /*switch ct { + case ColTypeInt: + return typeFor[int64]() + case ColTypeText: + return typeFor[string]() + case ColTypeBool: + return typeFor[bool]() + case ColTypeByteA: + return typeFor[[]byte]() + case ColTypeUUID: + return typeFor[*types.UUID]() + case ColTypeNumeric: + return typeFor[*decimal.Decimal]() + case ColTypeUINT256: + return typeFor[*types.Uint256]() + case ColTypeFloat: + return typeFor[float64]() + case ColTypeTime: + return typeFor[time.Time]() + default: + return nil + }*/ +} + func scanVal(ct ColType) any { switch ct { case ColTypeInt: @@ -365,6 +473,13 @@ const ( ColTypeUnknown ColType = "unknown" ) +// register the custom types for gob decoding. +func init() { + gob.RegisterName("kwil_"+string(ColTypeUUID), statsVal(ColTypeUUID)) + gob.RegisterName("kwil_"+string(ColTypeNumeric), statsVal(ColTypeNumeric)) + gob.RegisterName("kwil_"+string(ColTypeUINT256), statsVal(ColTypeUINT256)) +} + func arrayType(ct ColType) ColType { switch ct { case ColTypeInt: @@ -567,11 +682,6 @@ func columnInfo(ctx context.Context, conn *pgx.Conn, schema, tbl string) ([]ColI // ColumnInfo attempts to describe the columns of a table in a specified // PostgreSQL schema. The results are **as reported by information_schema.column**. -// -// If the provided sql.Executor is also a ColumnInfoer, its ColumnInfo method -// will be used. This is primarily for testing with a mocked DB transaction. -// Otherwise, the Executor must be one of the transaction types created by this -// package, which provide access to the underlying DB connection. 
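+//
+// Example (a sketch; the shapes shown follow TestTableStats in this package):
+//
+//	cols, _ := ColumnInfo(ctx, tx, "", "colcheck")
+//	// cols[0] == ColInfo{Pos: 1, Name: "a", DataType: "bigint", Nullable: false}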
func ColumnInfo(ctx context.Context, tx sql.Executor, schema, tbl string) ([]ColInfo, error) { if ti, ok := tx.(conner); ok { conn := ti.Conn() diff --git a/internal/sql/pg/system_test.go b/internal/sql/pg/system_test.go index 93f6ef389..0ee409de7 100644 --- a/internal/sql/pg/system_test.go +++ b/internal/sql/pg/system_test.go @@ -1,8 +1,14 @@ package pg import ( + "reflect" "testing" + "time" + "github.com/kwilteam/kwil-db/core/types" + "github.com/kwilteam/kwil-db/core/types/decimal" + + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) @@ -344,3 +350,190 @@ func TestSettingValidFnOR(t *testing.T) { }) } } + +func Test_joinSlices(t *testing.T) { + var s0 any = []int{} + // t.Logf("%T: %v", s0, s0) // []int: [] + + s1t := []int{1, 2, 3, 4} + var s1 any = s1t + s2t := []int{7, 8, 9} + var s2 any = s2t + + // var s3 []int // nope, s3 is any([]int) + s3 := joinSlices(s1, s2) + + assert.Equal(t, reflect.TypeOf(s0), reflect.TypeOf(s3)) + t.Log(reflect.TypeOf(s3)) + + s3t, ok := s3.([]int) + if !ok { + t.Fatalf("not a []int: %T", s3) + } + // t.Logf("%T: %v", s3, s3) // []int: [1 2 3 4 7 8 9] + + require.Len(t, s3t, len(s1t)+len(s2t)) + require.EqualValues(t, append(s1t, s2t...), s3t) +} + +func TestStatsVal(t *testing.T) { + tests := []struct { + name string + colType ColType + expected any + }{ + { + name: "ColTypeInt", + colType: ColTypeInt, + expected: int64(0), + }, + { + name: "ColTypeText", + colType: ColTypeText, + expected: "", + }, + { + name: "ColTypeBool", + colType: ColTypeBool, + expected: false, + }, + { + name: "ColTypeByteA", + colType: ColTypeByteA, + expected: []byte{}, + }, + { + name: "ColTypeUUID", + colType: ColTypeUUID, + expected: &types.UUID{}, + }, + { + name: "ColTypeNumeric", + colType: ColTypeNumeric, + expected: &decimal.Decimal{}, + }, + { + name: "ColTypeUINT256", + colType: ColTypeUINT256, + expected: &types.Uint256{}, + }, + { + name: "ColTypeFloat", + colType: ColTypeFloat, + expected: float64(0), + }, + { + name: "ColTypeTime", + colType: ColTypeTime, + expected: time.Time{}, + }, + { + name: "Unknown ColType", + colType: ColType("asdfasdf"), + expected: nil, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := statsVal(tt.colType) + require.IsType(t, tt.expected, result) + if tt.expected != nil { + require.Equal(t, reflect.TypeOf(tt.expected), reflect.TypeOf(result)) + } + }) + } +} +func TestStatsValType(t *testing.T) { + tests := []struct { + name string + colType ColType + expected reflect.Type + }{ + { + name: "ColTypeInt", + colType: ColTypeInt, + expected: reflect.TypeOf(int64(0)), + }, + { + name: "ColTypeText", + colType: ColTypeText, + expected: reflect.TypeOf(""), + }, + { + name: "ColTypeBool", + colType: ColTypeBool, + expected: reflect.TypeOf(false), + }, + { + name: "ColTypeByteA", + colType: ColTypeByteA, + expected: reflect.TypeOf([]byte(nil)), + }, + { + name: "ColTypeUUID", + colType: ColTypeUUID, + expected: reflect.TypeOf(&types.UUID{}), + }, + { + name: "ColTypeNumeric", + colType: ColTypeNumeric, + expected: reflect.TypeOf(&decimal.Decimal{}), + }, + { + name: "ColTypeUINT256", + colType: ColTypeUINT256, + expected: reflect.TypeOf(&types.Uint256{}), + }, + { + name: "ColTypeFloat", + colType: ColTypeFloat, + expected: reflect.TypeOf(float64(0)), + }, + { + name: "ColTypeTime", + colType: ColTypeTime, + expected: reflect.TypeOf(time.Time{}), + }, + { + name: "Unknown ColType", + colType: ColType("unknown"), + expected: nil, + }, + } + + for _, tt := range tests { + 
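+		// A nil expectation corresponds to an unrecognized ColType:
+		// statsValType is reflect.TypeOf(statsVal(ct)), statsVal returns nil
+		// for unknown types, and reflect.TypeOf(nil) is nil.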
t.Run(tt.name, func(t *testing.T) { + result := statsValType(tt.colType) + if tt.expected == nil { + require.Nil(t, result) + } else { + require.Equal(t, tt.expected, result) + } + }) + } +} + +func TestStatsValTypeConsistency(t *testing.T) { + allTypes := []ColType{ + ColTypeInt, + ColTypeText, + ColTypeBool, + ColTypeByteA, + ColTypeUUID, + ColTypeNumeric, + ColTypeUINT256, + ColTypeFloat, + ColTypeTime, + } + + for _, ct := range allTypes { + t.Run(string(ct), func(t *testing.T) { + valType := statsValType(ct) + val := statsVal(ct) + require.NotNil(t, valType) + require.NotNil(t, val) + require.Equal(t, valType, reflect.TypeOf(val)) + }) + } +} diff --git a/internal/sql/pg/types_test.go b/internal/sql/pg/types_test.go index b4c0142d6..fd16407d0 100644 --- a/internal/sql/pg/types_test.go +++ b/internal/sql/pg/types_test.go @@ -5,8 +5,37 @@ import ( "testing" "github.com/stretchr/testify/require" + + "github.com/kwilteam/kwil-db/core/types" + "github.com/kwilteam/kwil-db/core/types/decimal" ) +// mustDecimal panics if the string cannot be converted to a decimal. +func mustDecimal(s string) *decimal.Decimal { + d, err := decimal.NewFromString(s) + if err != nil { + panic(err) + } + return d +} + +func mustParseUUID(s string) *types.UUID { + u, err := types.ParseUUID(s) + if err != nil { + panic(err) + } + return u +} + +// mustUint256 panics if the string cannot be converted to a Uint256. +func mustUint256(s string) *types.Uint256 { + u, err := types.Uint256FromString(s) + if err != nil { + panic(err) + } + return u +} + func Test_ArrayEncodeDecode(t *testing.T) { arr := []string{"a", "b", "c"} res, err := serializeArray(arr, 4, func(s string) ([]byte, error) {