-
Notifications
You must be signed in to change notification settings - Fork 3.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat(influx_tools): Add export to parquet files #25297
Open
srebhan
wants to merge
14
commits into
influxdata:master-1.x
Choose a base branch
from
srebhan:v1-bulk-exporter-parquet
base: master-1.x
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from 11 commits
Commits
Show all changes
14 commits
Select commit
Hold shift + click to select a range
46aef0b
feat(influx_tools): Add export to parquet files
srebhan 9ed1d01
chore: Wrap errors in influx_tools main
srebhan c12f293
chore: Do not create unused series cursor and simplify batcher creation
srebhan a2367ee
chore: Move converter creation to batcher as it is only used there
srebhan 41dacce
fix: Capture error when closing series cursor
srebhan b7c9475
feat: Print shard series-file path on error
srebhan 182195f
chore: Replace panic by returning an error
srebhan 795e581
feat: Use logger instead of raw printing
srebhan 59b60e6
fix: Capture error when closing exporter
srebhan 390cf30
fix: Capture more defer errors
srebhan 3bfe17c
feat: Detect name conflicts after name resolution
srebhan 76a88d1
fix: Make sure deferred functions are actually called
srebhan f2423af
feat: Move out cursor handling
srebhan a7d0f1b
feat: Preallocate maps and slices
srebhan File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,236 @@ | ||
package parquet | ||
|
||
import ( | ||
"context" | ||
"fmt" | ||
"sort" | ||
|
||
"github.com/influxdata/influxdb/models" | ||
"github.com/influxdata/influxdb/tsdb" | ||
"github.com/influxdata/influxql" | ||
"go.uber.org/zap" | ||
) | ||
|
||
// row is a single output record assembled from the cursor data: one
// timestamp together with the tag set of the series it belongs to and
// every field value observed at that instant.
type row struct {
	timestamp int64                  // nanosecond epoch timestamp of the record
	tags      map[string]string      // tag key -> tag value of the owning series
	fields    map[string]interface{} // resolved field name -> converted value
}
|
||
// batcher reads the data of a single measurement from a shard and hands
// it out in timestamp-ordered batches of rows via next().
type batcher struct {
	measurement []byte      // name of the measurement to export
	shard       *tsdb.Shard // shard the measurement data is read from

	// typeResolutions maps fields with conflicting types to the type
	// their values should be converted to (see init()).
	typeResolutions map[string]influxql.DataType
	// converter holds the per-field conversion function, derived from
	// typeResolutions in init().
	converter map[string]func(interface{}) (interface{}, error)
	// nameResolutions maps original field names to the resolved output
	// name used in the emitted rows.
	nameResolutions map[string]string

	series []seriesEntry // series (key, tags, fields) of the measurement
	start  int64         // start timestamp (ns) of the next batch, advanced by next()

	logger *zap.SugaredLogger
}
|
||
func (b *batcher) init() error { | ||
// Setup the type converters for the conflicting fields | ||
b.converter = make(map[string]func(interface{}) (interface{}, error), len(b.typeResolutions)) | ||
for field, ftype := range b.typeResolutions { | ||
switch ftype { | ||
case influxql.Float: | ||
b.converter[field] = toFloat | ||
case influxql.Unsigned: | ||
b.converter[field] = toUint | ||
case influxql.Integer: | ||
b.converter[field] = toInt | ||
case influxql.Boolean: | ||
b.converter[field] = toBool | ||
case influxql.String: | ||
b.converter[field] = toString | ||
default: | ||
return fmt.Errorf("unknown converter %v for field %q", ftype, field) | ||
} | ||
} | ||
|
||
b.start = models.MinNanoTime | ||
|
||
return nil | ||
} | ||
|
||
// reset rewinds the batcher so the next call to next() reads from the
// earliest representable timestamp again.
func (b *batcher) reset() {
	b.start = models.MinNanoTime
}
|
||
func (b *batcher) next(ctx context.Context) ([]row, error) { | ||
// Iterate over the series and fields and accumulate the data row-wise | ||
iter, err := b.shard.CreateCursorIterator(ctx) | ||
if err != nil { | ||
return nil, fmt.Errorf("getting cursor iterator for %q failed: %w", string(b.measurement), err) | ||
} | ||
|
||
data := make(map[string]map[int64]row) | ||
end := models.MaxNanoTime | ||
for _, s := range b.series { | ||
data[s.key] = make(map[int64]row) | ||
tags := make(map[string]string, len(s.tags)) | ||
for _, t := range s.tags { | ||
tags[string(t.Key)] = string(t.Value) | ||
} | ||
for field := range s.fields { | ||
cursor, err := iter.Next(ctx, | ||
&tsdb.CursorRequest{ | ||
Name: b.measurement, | ||
Tags: s.tags, | ||
Field: field, | ||
Ascending: true, | ||
StartTime: b.start, | ||
EndTime: models.MaxNanoTime, | ||
}, | ||
) | ||
if err != nil { | ||
return nil, fmt.Errorf("getting cursor for %s-%s failed: %w", s.key, field, err) | ||
} | ||
if cursor == nil { | ||
continue | ||
} | ||
|
||
// Prepare mappings | ||
fname := field | ||
if n, found := b.nameResolutions[field]; found { | ||
fname = n | ||
} | ||
converter := identity | ||
if c, found := b.converter[field]; found { | ||
converter = c | ||
} | ||
fieldEnd := models.MaxNanoTime | ||
switch c := cursor.(type) { | ||
case tsdb.IntegerArrayCursor: | ||
values := c.Next() | ||
for i, t := range values.Timestamps { | ||
v, err := converter(values.Values[i]) | ||
if err != nil { | ||
b.logger.Errorf("converting %v of field %q failed: %v", values.Values[i], field, err) | ||
continue | ||
} | ||
|
||
if _, found := data[s.key][t]; !found { | ||
data[s.key][t] = row{ | ||
timestamp: t, | ||
tags: tags, | ||
fields: make(map[string]interface{}), | ||
} | ||
} | ||
|
||
data[s.key][t].fields[fname] = v | ||
fieldEnd = t | ||
} | ||
case tsdb.FloatArrayCursor: | ||
values := c.Next() | ||
for i, t := range values.Timestamps { | ||
v, err := converter(values.Values[i]) | ||
if err != nil { | ||
b.logger.Errorf("converting %v of field %q failed: %v", values.Values[i], field, err) | ||
continue | ||
} | ||
|
||
if _, found := data[s.key][t]; !found { | ||
data[s.key][t] = row{ | ||
timestamp: t, | ||
tags: tags, | ||
fields: make(map[string]interface{}), | ||
} | ||
} | ||
|
||
data[s.key][t].fields[fname] = v | ||
fieldEnd = t | ||
} | ||
case tsdb.UnsignedArrayCursor: | ||
values := c.Next() | ||
for i, t := range values.Timestamps { | ||
v, err := converter(values.Values[i]) | ||
if err != nil { | ||
b.logger.Errorf("converting %v of field %q failed: %v", values.Values[i], field, err) | ||
continue | ||
} | ||
|
||
if _, found := data[s.key][t]; !found { | ||
data[s.key][t] = row{ | ||
timestamp: t, | ||
tags: tags, | ||
fields: make(map[string]interface{}), | ||
} | ||
} | ||
|
||
data[s.key][t].fields[fname] = v | ||
fieldEnd = t | ||
} | ||
case tsdb.BooleanArrayCursor: | ||
values := c.Next() | ||
for i, t := range values.Timestamps { | ||
v, err := converter(values.Values[i]) | ||
if err != nil { | ||
b.logger.Errorf("converting %v of field %q failed: %v", values.Values[i], field, err) | ||
continue | ||
} | ||
|
||
if _, found := data[s.key][t]; !found { | ||
data[s.key][t] = row{ | ||
timestamp: t, | ||
tags: tags, | ||
fields: make(map[string]interface{}), | ||
} | ||
} | ||
|
||
data[s.key][t].fields[fname] = v | ||
fieldEnd = t | ||
} | ||
case tsdb.StringArrayCursor: | ||
values := c.Next() | ||
for i, t := range values.Timestamps { | ||
v, err := converter(values.Values[i]) | ||
if err != nil { | ||
b.logger.Errorf("converting %v of field %q failed: %v", values.Values[i], field, err) | ||
continue | ||
} | ||
|
||
if _, found := data[s.key][t]; !found { | ||
data[s.key][t] = row{ | ||
timestamp: t, | ||
tags: tags, | ||
fields: make(map[string]interface{}), | ||
} | ||
} | ||
|
||
data[s.key][t].fields[fname] = v | ||
fieldEnd = t | ||
} | ||
default: | ||
cursor.Close() | ||
return nil, fmt.Errorf("unexpected type %T", cursor) | ||
} | ||
cursor.Close() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. does There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No it does not. |
||
end = min(end, fieldEnd) | ||
} | ||
} | ||
if len(data) == 0 { | ||
return nil, nil | ||
} | ||
|
||
// Extract the rows ordered by timestamp | ||
var rows []row | ||
srebhan marked this conversation as resolved.
Show resolved
Hide resolved
|
||
for _, tmap := range data { | ||
for _, r := range tmap { | ||
rows = append(rows, r) | ||
} | ||
} | ||
sort.Slice(rows, func(i, j int) bool { return rows[i].timestamp < rows[j].timestamp }) | ||
|
||
// Only include rows that are before the end-timestamp to avoid duplicate | ||
// or incomplete entries due to not iterating through all data | ||
n := sort.Search(len(rows), func(i int) bool { return rows[i].timestamp > end }) | ||
|
||
// Remember the earliest datum to use this for the next batch excluding the entry itself | ||
b.start = end + 1 | ||
|
||
return rows[:n], nil | ||
} |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could the cases in this type switch call a single generic function to reduce duplicate code? I'm not sure, but that would simplify maintenance and readability.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I tried my very best, but due to the fact that each implementation returns a different type and function signature for
Next()
I could not find a way to make this generic. Do you have some idea?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I figured out how to implement a generic
Next
function, but the access to the Values and Timestamps fields are the problem. Even with type constraints that all have two fields namedValues
andTimestamps
the compiler inference fails.https://goplay.tools/snippet/_BsjhhcYGGB