Skip to content

Commit

Permalink
pg: stats, mvcs, histo
Browse files Browse the repository at this point in the history
  • Loading branch information
jchappelow committed Aug 22, 2024
1 parent 4ece583 commit a2407dc
Show file tree
Hide file tree
Showing 13 changed files with 2,352 additions and 100 deletions.
149 changes: 142 additions & 7 deletions common/sql/statistics.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ import (
// Statistics holds the row count of a relation together with the statistics
// collected for its columns.
type Statistics struct {
	// RowCount is the number of rows.
	RowCount int64

	// ColumnStatistics holds the per-column statistics, addressed by column
	// index (see ColStat).
	ColumnStatistics []ColumnStatistics // maybe []any to work with generic ColumnStats[T any]

	//Selectivity, for plan statistics
}
Expand All @@ -27,8 +27,13 @@ func (s *Statistics) String() string {
}
for i, cs := range s.ColumnStatistics {
fmt.Fprintf(&st, " Column %d:\n", i)
fmt.Fprintf(&st, " - Min/Max = %v / %v\n", cs.Min, cs.Max)
if _, ok := cs.Min.(string); ok {
fmt.Fprintf(&st, " - Min/Max = %.64s / %.64s\n", cs.Min, cs.Max)
} else {
fmt.Fprintf(&st, " - Min/Max = %v / %v\n", cs.Min, cs.Max)
}
fmt.Fprintf(&st, " - NULL count = %v\n", cs.NullCount)
fmt.Fprintf(&st, " - Num MCVs = %v\n", len(cs.MCFreqs))
}
return st.String()
}
Expand All @@ -38,6 +43,26 @@ type ValCount struct {
Count int
}

// Stats groups a row count with per-column type names and per-column
// statistics values.
// NOTE(review): ColTypes and ColStats are presumably parallel slices indexed by
// column position — confirm against the code that populates them.
type Stats struct {
RowCount int
ColTypes []string // pg.ColType or []reflect.Type ?
ColStats []any // []ColStatsT[T], each col has different T
}

/*type ColStatsT[T any] struct {
NullCount int
Min T
MinCount int
Max T
MaxCount int
MCVals []T
MCFreqs []int
}
func NewColStatsT[T any](T) *ColStatsT[T] {
return &ColStatsT[T]{}
}*/

// ColumnStatistics contains statistics about a column.
type ColumnStatistics struct {
NullCount int64
Expand All @@ -58,16 +83,126 @@ type ColumnStatistics struct {
// MCVs []ValCount
// MCVs map[cmp.Ordered]

// MCVals []any
// MCFreqs []int
MCVals []any // []T
MCFreqs []int
// ^ NOTE: MCVals was easier in many ways with just any.([]T), but other
// ways much more inconvenient, so we have it as an []any. May go back.

// DistinctCount is harder. For example, unless we sub-sample
// (deterministically), tracking distinct values could involve a data
// structure with the same number of elements as rows in the table.
DistinctCount int64
// or a sophisticated algo, e.g. https://github.com/axiomhq/hyperloglog
// DistinctCount int64
// alt, -1 means don't know

// AvgSize can affect cost as it changes the number of "pages" in postgres
// terminology, representing the size of data returned or processed by an
// expression.
AvgSize int64 // maybe: length of text, length of array, otherwise not used for scalar?

// without histogram, we can make uniformity assumption to simplify the cost model
//Histogram []HistogramBucket
Histogram any // histo[T]
}

// ColStat returns a pointer to the statistics entry for the column at the
// given index. The pointer refers to the element stored in
// s.ColumnStatistics, so writes through it are visible in s. An out-of-range
// index panics.
func (s *Statistics) ColStat(index int) *ColumnStatistics {
	cols := s.ColumnStatistics
	return &cols[index]
}

// NewStatistics builds a Statistics from the given row count and per-column
// statistics. The colStats slice is stored as-is, not copied.
func NewStatistics(rowCount int64, colStats []ColumnStatistics) *Statistics {
	stats := new(Statistics)
	stats.RowCount = rowCount
	stats.ColumnStatistics = colStats
	return stats
}

// NewEmptyStatistics returns a Statistics with a zero row count and no column
// statistics.
func NewEmptyStatistics() *Statistics {
	// The zero value already has RowCount == 0 and a nil ColumnStatistics.
	return new(Statistics)
}

// TableRef is a PostgreSQL-schema-qualified table name. Namespace may be left
// empty, in which case String returns only the table name.
type TableRef struct {
Namespace string // e.g. schema in Postgres, derived from Kwil dataset schema DBID
Table string
}

// String renders the reference as "namespace.table", or as just the table
// name when Namespace is empty.
func (t *TableRef) String() string {
	if t.Namespace == "" {
		return t.Table
	}
	return fmt.Sprintf("%s.%s", t.Namespace, t.Table)
}

// ColumnDef identifies a column by name, optionally qualified by the relation
// it belongs to. Relation is nil for an unqualified column (see
// ColumnUnqualified).
type ColumnDef struct {
Relation *TableRef
Name string
}

// ColumnUnqualified returns a ColumnDef that carries only a column name; its
// Relation is left nil.
func ColumnUnqualified(name string) *ColumnDef {
	col := new(ColumnDef)
	col.Name = name
	return col
}

// Column returns a ColumnDef for the named column qualified by table. The
// table pointer is stored as given and may be nil.
func Column(table *TableRef, name string) *ColumnDef {
	col := new(ColumnDef)
	col.Relation = table
	col.Name = name
	return col
}

// Field represents a field (column) in a schema.
type Field struct {
// Rel is the relation the field is associated with; it is nil unless set
// explicitly.
Rel *TableRef // relation, maybe not pointer?

// Name is the field (column) name.
Name string
// Type is the name of the field's data type.
Type string
// Nullable presumably records whether the column accepts NULL — confirm
// against the code that populates it.
Nullable bool
// HasIndex presumably records whether an index exists on the column —
// TODO confirm; nothing visible here sets it.
HasIndex bool
}

// NewField returns a Field with the given name, data type and nullability.
// Rel and HasIndex are left at their zero values.
func NewField(name string, dataType string, nullable bool) Field {
	var f Field
	f.Name = name
	f.Type = dataType
	f.Nullable = nullable
	return f
}

// NewFieldWithRelation returns a Field with the given name, data type and
// nullability that is associated with relation. HasIndex is left false.
func NewFieldWithRelation(name string, dataType string, nullable bool, relation *TableRef) Field {
	var f Field
	f.Rel = relation
	f.Name = name
	f.Type = dataType
	f.Nullable = nullable
	return f
}

// Relation returns the relation stored in f.Rel; the result is nil when no
// relation has been set on the field.
func (f *Field) Relation() *TableRef {
return f.Rel
}

// QualifiedColumn returns a new ColumnDef built from the field's relation and
// name. The relation pointer is shared with the field, not copied.
func (f *Field) QualifiedColumn() *ColumnDef {
	return &ColumnDef{
		Relation: f.Rel,
		Name:     f.Name,
	}
}

// Schema represents a database as a slice of all columns in all relations. See
// also Field.
type Schema struct {
// Fields lists the columns; each Field may carry its own relation in Rel.
Fields []Field
// index
//Indexes []Index
}

// NewSchema returns a Schema holding the given fields. The variadic slice is
// stored directly, so a slice passed with "..." is shared with the result.
func NewSchema(fields ...Field) *Schema {
	sch := new(Schema)
	sch.Fields = fields
	return sch
}

// NewSchemaQualified returns a Schema holding the given fields, with every
// field's Rel set to relation. The fields are modified in place, so a slice
// passed with "..." has its elements updated and is shared with the result.
func NewSchemaQualified(relation *TableRef, fields ...Field) *Schema {
	for i := 0; i < len(fields); i++ {
		f := &fields[i]
		f.Rel = relation
	}
	sch := new(Schema)
	sch.Fields = fields
	return sch
}

// String renders the schema as a bracketed, comma-separated list of
// "name/type" entries, e.g. "[id/int, name/text]". An empty schema renders
// as "[]".
func (s *Schema) String() string {
	parts := make([]string, len(s.Fields))
	for i := range s.Fields {
		f := &s.Fields[i]
		parts[i] = f.Name + "/" + f.Type
	}
	return fmt.Sprintf("[%s]", strings.Join(parts, ", "))
}

// DataSource is implemented by sources of data that can describe both their
// schema and their statistics.
type DataSource interface {
// Schema returns the schema for the underlying data source.
Schema() *Schema
// Statistics returns the statistics of the data source.
Statistics() *Statistics
}
5 changes: 5 additions & 0 deletions core/types/uuid.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package types

import (
"bytes"
"database/sql"
"database/sql/driver"
"encoding/json"
Expand All @@ -14,6 +15,10 @@ var namespace = uuid.MustParse("cc1cd90f-b4db-47f4-b6df-4bbe5fca88eb")
// UUID is an RFC 4122 compliant UUIDv5, held as a 16-byte array.
type UUID [16]byte

// CmpUUID compares u and v byte by byte in lexicographic order. It returns -1
// if u sorts before v, 0 if they are equal, and +1 if u sorts after v.
func CmpUUID(u, v UUID) int {
	left, right := u[:], v[:]
	return bytes.Compare(left, right)
}

// NewUUIDV5 generates a uuidv5 from a byte slice.
// This is used to deterministically generate uuids.
func NewUUIDV5(from []byte) *UUID {
Expand Down
36 changes: 2 additions & 34 deletions internal/sql/pg/db_live_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"cmp"
"context"
"fmt"
"os"
"reflect"
"slices"
"strconv"
Expand All @@ -28,7 +29,7 @@ import (

// TestMain is the test entry point for the package. It runs the tests once
// and exits the process with the code reported by m.Run, so that failures are
// reflected in the exit status.
func TestMain(m *testing.M) {
	// UseLogger(log.NewStdOut(log.InfoLevel))
	os.Exit(m.Run())
}

const (
Expand Down Expand Up @@ -266,13 +267,6 @@ func TestNULL(t *testing.T) {
require.Equal(t, bvn.Int64, insB)
}

// typeFor reports the reflect.Type corresponding to the type argument T.
// TODO: drop this helper in favor of reflect.TypeFor once Go 1.22 is the
// minimum required version; reflect.TypeFor is not available in Go 1.21.
func typeFor[T any]() reflect.Type {
	// A nil *T carries T's type information even when T is an interface.
	var ptr *T
	return reflect.TypeOf(ptr).Elem()
}

func TestScanVal(t *testing.T) {
cols := []ColInfo{
{Pos: 1, Name: "a", DataType: "bigint", Nullable: false},
Expand Down Expand Up @@ -974,32 +968,6 @@ func TestTypeRoundtrip(t *testing.T) {
}
}

// mustDecimal parses s with decimal.NewFromString and returns the result. It
// panics with the parse error if s cannot be converted.
func mustDecimal(s string) *decimal.Decimal {
	dec, parseErr := decimal.NewFromString(s)
	if parseErr != nil {
		panic(parseErr)
	}
	return dec
}

// mustParseUUID parses s with types.ParseUUID and returns the result. It
// panics with the parse error if s cannot be converted.
func mustParseUUID(s string) *types.UUID {
	id, parseErr := types.ParseUUID(s)
	if parseErr != nil {
		panic(parseErr)
	}
	return id
}

// mustUint256 parses s with types.Uint256FromString and returns the result.
// It panics with the parse error if s cannot be converted.
func mustUint256(s string) *types.Uint256 {
	val, parseErr := types.Uint256FromString(s)
	if parseErr != nil {
		panic(parseErr)
	}
	return val
}

func Test_DelayedTx(t *testing.T) {
ctx := context.Background()

Expand Down
Loading

0 comments on commit a2407dc

Please sign in to comment.