pg: stats, mcvs, histo

kwilteam · Aug 23, 2024 · 8e97ef3 · 8e97ef3
1 parent 4ece583
commit 8e97ef3
Show file tree

Hide file tree

Showing 13 changed files with 2,315 additions and 106 deletions.
diff --git a/common/sql/statistics.go b/common/sql/statistics.go
@@ -15,8 +15,7 @@ type Statistics struct {
 	RowCount int64
 
 	ColumnStatistics []ColumnStatistics
-
-	//Selectivity, for plan statistics
+	// NOTE: above may be better as []any to work with a generic ColStatsT[T]
 }
 
 func (s *Statistics) String() string {
@@ -27,17 +26,18 @@ func (s *Statistics) String() string {
 	}
 	for i, cs := range s.ColumnStatistics {
 		fmt.Fprintf(&st, " Column %d:\n", i)
-		fmt.Fprintf(&st, " - Min/Max = %v / %v\n", cs.Min, cs.Max)
+		if _, ok := cs.Min.(string); ok {
+			fmt.Fprintf(&st, " - Min/Max = %.64s / %.64s\n", cs.Min, cs.Max)
+		} else {
+			fmt.Fprintf(&st, " - Min/Max = %v / %v\n", cs.Min, cs.Max)
+		}
 		fmt.Fprintf(&st, " - NULL count = %v\n", cs.NullCount)
+		fmt.Fprintf(&st, " - Num MCVs = %v\n", len(cs.MCFreqs))
+		fmt.Fprintf(&st, " - Histogram = {%v}\n", cs.Histogram) // it's any, but also a fmt.Stringer
 	}
 	return st.String()
 }
 
-type ValCount struct {
-	Val   any
-	Count int
-}
-
 // ColumnStatistics contains statistics about a column.
 type ColumnStatistics struct {
 	NullCount int64
@@ -58,16 +58,133 @@ type ColumnStatistics struct {
 	// MCVs []ValCount
 	// MCVs map[cmp.Ordered]
 
-	// MCVals  []any
-	// MCFreqs []int
+	MCVals  []any // []T
+	MCFreqs []int
+	// ^ NOTE: MCVals was easier in many ways with just any.([]T), but in other
+	// ways much more inconvenient, so we have it as an []any.  May go back.
 
 	// DistinctCount is harder. For example, unless we sub-sample
 	// (deterministically), tracking distinct values could involve a data
 	// structure with the same number of elements as rows in the table.
-	DistinctCount int64
+	// or a sophisticated algo, e.g. https://github.com/axiomhq/hyperloglog
+	// DistinctCount int64
+	// alt, -1 means don't know
 
+	// AvgSize can affect cost as it changes the number of "pages" in postgres
+	// terminology, representing the size of data returned or processed by an
+	// expression.
 	AvgSize int64 // maybe: length of text, length of array, otherwise not used for scalar?
 
-	// without histogram, we can make uniformity assumption to simplify the cost model
-	//Histogram     []HistogramBucket
+	Histogram any // histo[T]
+}
+
+/* Perhaps I should have started fresh with a fully generic column stats struct... under consideration.
+
+type ColStatsT[T any] struct {
+	NullCount int
+	Min       T
+	MinCount  int
+	Max       T
+	MaxCount  int
+	MCVals    []T
+	MCFreqs   []int
+}
+*/
+
+func NewEmptyStatistics(numCols int) *Statistics {
+	return &Statistics{
+		RowCount:         0,
+		ColumnStatistics: make([]ColumnStatistics, numCols),
+	}
+}
+
+// ALL of the following types are from the initial query plan draft PR by Yaiba.
+// Only TableRef gets much use in the current statistics work. An integration
+// branch uses the other field and schema types a bit more, but it's easy to
+// change any of this...
+
+// TableRef is a PostgreSQL-schema-qualified table name.
+type TableRef struct {
+	Namespace string // e.g. schema in Postgres, derived from Kwil dataset schema DBID
+	Table     string
+}
+
+// String returns the fully qualified table name as "namespace.table" if
+// Namespace is set, otherwise it just returns the table name.
+func (t *TableRef) String() string {
+	if t.Namespace != "" {
+		return fmt.Sprintf("%s.%s", t.Namespace, t.Table)
+	}
+	return t.Table
+}
+
+type ColumnDef struct {
+	Relation *TableRef
+	Name     string
+}
+
+func ColumnUnqualified(name string) *ColumnDef {
+	return &ColumnDef{Name: name}
+}
+
+func Column(table *TableRef, name string) *ColumnDef {
+	return &ColumnDef{Relation: table, Name: name}
+}
+
+// Field represents a field (column) in a schema.
+type Field struct {
+	Rel *TableRef
+
+	Name     string
+	Type     string
+	Nullable bool
+	HasIndex bool
+}
+
+func NewField(name string, dataType string, nullable bool) Field {
+	return Field{Name: name, Type: dataType, Nullable: nullable}
+}
+
+func NewFieldWithRelation(name string, dataType string, nullable bool, relation *TableRef) Field {
+	return Field{Name: name, Type: dataType, Nullable: nullable, Rel: relation}
+}
+
+func (f *Field) Relation() *TableRef {
+	return f.Rel
+}
+
+func (f *Field) QualifiedColumn() *ColumnDef {
+	return Column(f.Rel, f.Name)
+}
+
+// Schema represents a database as a slice of all columns in all relations. See
+// also Field.
+type Schema struct {
+	Fields []Field
+}
+
+func NewSchema(fields ...Field) *Schema {
+	return &Schema{Fields: fields}
+}
+
+func NewSchemaQualified(relation *TableRef, fields ...Field) *Schema {
+	for i := range fields {
+		fields[i].Rel = relation
+	}
+	return &Schema{Fields: fields}
+}
+
+func (s *Schema) String() string {
+	var fields []string
+	for _, f := range s.Fields {
+		fields = append(fields, fmt.Sprintf("%s/%s", f.Name, f.Type))
+	}
+	return fmt.Sprintf("[%s]", strings.Join(fields, ", "))
+}
+
+type DataSource interface {
+	// Schema returns the schema for the underlying data source
+	Schema() *Schema
+	// Statistics returns the statistics of the data source.
+	Statistics() *Statistics
 }
diff --git a/core/types/uuid.go b/core/types/uuid.go
@@ -1,6 +1,7 @@
 package types
 
 import (
+	"bytes"
 	"database/sql"
 	"database/sql/driver"
 	"encoding/json"
@@ -14,6 +15,13 @@ var namespace = uuid.MustParse("cc1cd90f-b4db-47f4-b6df-4bbe5fca88eb")
 // UUID is a rfc4122 compliant uuidv5
 type UUID [16]byte
 
+// CmpUUID compares two UUIDs, returning 0 if equal, -1 if u<v, and 1 if u>v.
+// This satisfies the comparison function required by many generic functions in
+// the standard library and Kwil.
+func CmpUUID(u, v UUID) int {
+	return bytes.Compare(u[:], v[:])
+}
+
 // NewUUIDV5 generates a uuidv5 from a byte slice.
 // This is used to deterministically generate uuids.
 func NewUUIDV5(from []byte) *UUID {

diff --git a/internal/sql/pg/db_live_test.go b/internal/sql/pg/db_live_test.go
@@ -7,6 +7,7 @@ import (
 	"cmp"
 	"context"
 	"fmt"
+	"os"
 	"reflect"
 	"slices"
 	"strconv"
@@ -28,7 +29,7 @@ import (
 
 func TestMain(m *testing.M) {
 	// UseLogger(log.NewStdOut(log.InfoLevel))
-	m.Run()
+	os.Exit(m.Run())
 }
 
 const (
@@ -266,13 +267,6 @@ func TestNULL(t *testing.T) {
 	require.Equal(t, bvn.Int64, insB)
 }
 
-// typeFor returns the reflect.Type that represents the type argument T. TODO:
-// Remove this in favor of reflect.TypeFor when Go 1.22 becomes the minimum
-// required version since it is not available in Go 1.21.
-func typeFor[T any]() reflect.Type {
-	return reflect.TypeOf((*T)(nil)).Elem()
-}
-
 func TestScanVal(t *testing.T) {
 	cols := []ColInfo{
 		{Pos: 1, Name: "a", DataType: "bigint", Nullable: false},
@@ -974,32 +968,6 @@ func TestTypeRoundtrip(t *testing.T) {
 	}
 }
 
-// mustDecimal panics if the string cannot be converted to a decimal.
-func mustDecimal(s string) *decimal.Decimal {
-	d, err := decimal.NewFromString(s)
-	if err != nil {
-		panic(err)
-	}
-	return d
-}
-
-func mustParseUUID(s string) *types.UUID {
-	u, err := types.ParseUUID(s)
-	if err != nil {
-		panic(err)
-	}
-	return u
-}
-
-// mustUint256 panics if the string cannot be converted to a Uint256.
-func mustUint256(s string) *types.Uint256 {
-	u, err := types.Uint256FromString(s)
-	if err != nil {
-		panic(err)
-	}
-	return u
-}
-
 func Test_DelayedTx(t *testing.T) {
 	ctx := context.Background()