Skip to content

Commit

Permalink
pg: stats, mvcs, histo
Browse files Browse the repository at this point in the history
  • Loading branch information
jchappelow committed Aug 23, 2024
1 parent 4ece583 commit 8e97ef3
Show file tree
Hide file tree
Showing 13 changed files with 2,315 additions and 106 deletions.
143 changes: 130 additions & 13 deletions common/sql/statistics.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,7 @@ type Statistics struct {
RowCount int64

ColumnStatistics []ColumnStatistics

//Selectivity, for plan statistics
// NOTE: above may be better as []any to work with a generic ColStatsT[T]
}

func (s *Statistics) String() string {
Expand All @@ -27,17 +26,18 @@ func (s *Statistics) String() string {
}
for i, cs := range s.ColumnStatistics {
fmt.Fprintf(&st, " Column %d:\n", i)
fmt.Fprintf(&st, " - Min/Max = %v / %v\n", cs.Min, cs.Max)
if _, ok := cs.Min.(string); ok {
fmt.Fprintf(&st, " - Min/Max = %.64s / %.64s\n", cs.Min, cs.Max)
} else {
fmt.Fprintf(&st, " - Min/Max = %v / %v\n", cs.Min, cs.Max)
}
fmt.Fprintf(&st, " - NULL count = %v\n", cs.NullCount)
fmt.Fprintf(&st, " - Num MCVs = %v\n", len(cs.MCFreqs))
fmt.Fprintf(&st, " - Histogram = {%v}\n", cs.Histogram) // it's any, but also a fmt.Stringer
}
return st.String()
}

// ValCount pairs a value of any type with an integer count.
// NOTE(review): presumably Count is the number of occurrences of Val in a
// column (e.g. a most-common-value entry) — confirm against usage.
type ValCount struct {
Val any
Count int
}

// ColumnStatistics contains statistics about a column.
type ColumnStatistics struct {
NullCount int64
Expand All @@ -58,16 +58,133 @@ type ColumnStatistics struct {
// MCVs []ValCount
// MCVs map[cmp.Ordered]

// MCVals []any
// MCFreqs []int
MCVals []any // []T
MCFreqs []int
// ^ NOTE: MCVals was easier in many ways as just any.([]T), but much more
// inconvenient in other ways, so we have it as an []any. May go back.

// DistinctCount is harder. For example, unless we sub-sample
// (deterministically), tracking distinct values could involve a data
// structure with the same number of elements as rows in the table.
DistinctCount int64
// or a sophisticated algo, e.g. https://github.com/axiomhq/hyperloglog
// DistinctCount int64
// alt, -1 means don't know

// AvgSize can affect cost as it changes the number of "pages" in postgres
// terminology, representing the size of data returned or processed by an
// expression.
AvgSize int64 // maybe: length of text, length of array, otherwise not used for scalar?

// without histogram, we can make uniformity assumption to simplify the cost model
//Histogram []HistogramBucket
Histogram any // histo[T]
}

/* Perhaps I should have started fresh with a fully generic column stats struct... under consideration.
type ColStatsT[T any] struct {
NullCount int
Min T
MinCount int
Max T
MaxCount int
MCVals []T
MCFreqs []int
}
*/

// NewEmptyStatistics returns a Statistics with a zero RowCount and numCols
// zero-valued ColumnStatistics entries, one per column.
func NewEmptyStatistics(numCols int) *Statistics {
	perColumn := make([]ColumnStatistics, numCols)
	stats := &Statistics{
		RowCount:         0,
		ColumnStatistics: perColumn,
	}
	return stats
}

// ALL of the following types are from the initial query plan draft PR by Yaiba.
// Only TableRef gets much use in the current statistics work. An integration
// branch uses the other field and schema types a bit more, but it's easy to
// change any of this...

// TableRef is a PostgreSQL-schema-qualified table name.
type TableRef struct {
	Namespace string // e.g. schema in Postgres, derived from Kwil dataset schema DBID
	Table     string
}

// String formats the reference as "namespace.table". When Namespace is empty
// only the bare table name is returned.
func (t *TableRef) String() string {
	// Unqualified reference: nothing to prefix.
	if t.Namespace == "" {
		return t.Table
	}
	return fmt.Sprintf("%s.%s", t.Namespace, t.Table)
}

// ColumnDef identifies a column by name, optionally qualified by the table
// reference it belongs to.
type ColumnDef struct {
	Relation *TableRef
	Name     string
}

// ColumnUnqualified returns a ColumnDef for name with no Relation set.
func ColumnUnqualified(name string) *ColumnDef {
	col := new(ColumnDef)
	col.Name = name
	return col
}

// Column returns a ColumnDef for name qualified by the given table reference.
func Column(table *TableRef, name string) *ColumnDef {
	col := new(ColumnDef)
	col.Relation = table
	col.Name = name
	return col
}

// Field represents a field (column) in a schema.
type Field struct {
	// Rel is the table the field belongs to. NewField leaves it nil.
	Rel *TableRef

	Name     string
	Type     string
	Nullable bool
	HasIndex bool
}

// NewField builds a Field from its name, data type and nullability. Rel and
// HasIndex are left at their zero values.
func NewField(name string, dataType string, nullable bool) Field {
	var f Field
	f.Name = name
	f.Type = dataType
	f.Nullable = nullable
	return f
}

// NewFieldWithRelation is like NewField but also sets the Field's relation.
func NewFieldWithRelation(name string, dataType string, nullable bool, relation *TableRef) Field {
	var f Field
	f.Rel = relation
	f.Name = name
	f.Type = dataType
	f.Nullable = nullable
	return f
}

// Relation returns the table reference stored in the Field's Rel.
func (f *Field) Relation() *TableRef {
	rel := f.Rel
	return rel
}

// QualifiedColumn returns a ColumnDef carrying the Field's relation and name.
func (f *Field) QualifiedColumn() *ColumnDef {
	rel, name := f.Rel, f.Name
	return Column(rel, name)
}

// Schema represents a database as a slice of all columns in all relations. See
// also Field.
type Schema struct {
	Fields []Field
}

// NewSchema returns a Schema holding the given fields. The fields slice is
// stored as-is, without copying.
func NewSchema(fields ...Field) *Schema {
	schema := new(Schema)
	schema.Fields = fields
	return schema
}

// NewSchemaQualified sets Rel on every given field to relation and returns a
// Schema holding them. The assignment is done in place, so a slice passed with
// fields... is modified and then stored without copying.
func NewSchemaQualified(relation *TableRef, fields ...Field) *Schema {
	for idx := 0; idx < len(fields); idx++ {
		fields[idx].Rel = relation
	}
	return &Schema{Fields: fields}
}

// String renders the schema as "[name/type, name/type, ...]".
func (s *Schema) String() string {
	parts := make([]string, len(s.Fields))
	for i := range s.Fields {
		field := &s.Fields[i]
		parts[i] = fmt.Sprintf("%s/%s", field.Name, field.Type)
	}
	joined := strings.Join(parts, ", ")
	return fmt.Sprintf("[%s]", joined)
}

// DataSource is implemented by types that can report both their schema and
// their statistics.
type DataSource interface {
// Schema returns the schema for the underlying data source
Schema() *Schema
// Statistics returns the statistics of the data source.
Statistics() *Statistics
}
8 changes: 8 additions & 0 deletions core/types/uuid.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package types

import (
"bytes"
"database/sql"
"database/sql/driver"
"encoding/json"
Expand All @@ -14,6 +15,13 @@ var namespace = uuid.MustParse("cc1cd90f-b4db-47f4-b6df-4bbe5fca88eb")
// UUID is an RFC 4122 compliant UUIDv5, stored as its 16 raw bytes.
type UUID [16]byte

// CmpUUID orders two UUIDs by comparing their bytes lexicographically. It
// returns 0 if u == v, -1 if u < v, and 1 if u > v, which is the shape of
// comparison function required by many generic functions in the standard
// library and Kwil.
func CmpUUID(u, v UUID) int {
	left, right := u[:], v[:]
	return bytes.Compare(left, right)
}

// NewUUIDV5 generates a uuidv5 from a byte slice.
// This is used to deterministically generate uuids.
func NewUUIDV5(from []byte) *UUID {
Expand Down
36 changes: 2 additions & 34 deletions internal/sql/pg/db_live_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"cmp"
"context"
"fmt"
"os"
"reflect"
"slices"
"strconv"
Expand All @@ -28,7 +29,7 @@ import (

// TestMain is the entry point for this package's tests. It runs the suite
// exactly once and exits the process with the code returned by m.Run, so a
// failing run is reported through the exit status.
func TestMain(m *testing.M) {
	// UseLogger(log.NewStdOut(log.InfoLevel))

	// Call m.Run only once: a second bare call would execute the whole suite
	// twice and throw away the first result.
	os.Exit(m.Run())
}

const (
Expand Down Expand Up @@ -266,13 +267,6 @@ func TestNULL(t *testing.T) {
require.Equal(t, bvn.Int64, insB)
}

// typeFor reports the reflect.Type corresponding to the type argument T. It
// goes through a nil *T and takes the pointer's element type, so it also
// works when T is an interface type.
//
// TODO: Replace with reflect.TypeFor once Go 1.22 is the minimum required
// version; it is not available in Go 1.21.
func typeFor[T any]() reflect.Type {
	var ptr *T
	return reflect.TypeOf(ptr).Elem()
}

func TestScanVal(t *testing.T) {
cols := []ColInfo{
{Pos: 1, Name: "a", DataType: "bigint", Nullable: false},
Expand Down Expand Up @@ -974,32 +968,6 @@ func TestTypeRoundtrip(t *testing.T) {
}
}

// mustDecimal converts s with decimal.NewFromString and returns the result,
// panicking with the returned error if the conversion fails.
func mustDecimal(s string) *decimal.Decimal {
	dec, convErr := decimal.NewFromString(s)
	if convErr == nil {
		return dec
	}
	panic(convErr)
}

// mustParseUUID converts s with types.ParseUUID and returns the result,
// panicking with the returned error if parsing fails.
func mustParseUUID(s string) *types.UUID {
	id, parseErr := types.ParseUUID(s)
	if parseErr == nil {
		return id
	}
	panic(parseErr)
}

// mustUint256 converts s with types.Uint256FromString and returns the result,
// panicking with the returned error if the conversion fails.
func mustUint256(s string) *types.Uint256 {
	val, convErr := types.Uint256FromString(s)
	if convErr == nil {
		return val
	}
	panic(convErr)
}

func Test_DelayedTx(t *testing.T) {
ctx := context.Background()

Expand Down
Loading

0 comments on commit 8e97ef3

Please sign in to comment.