Skip to content

Commit

Permalink
Upgrade ZSV library (#2402)
Browse files Browse the repository at this point in the history
### What problem does this PR solve?

Upgrade ZSV library

### Type of change

- [x] Refactoring

Signed-off-by: Jin Hai <[email protected]>
  • Loading branch information
JinHai-CN authored Dec 24, 2024
1 parent 2df22b5 commit ede4d2d
Show file tree
Hide file tree
Showing 34 changed files with 1,291 additions and 877 deletions.
2 changes: 1 addition & 1 deletion third_party/versions
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ spdlog v1.10.0
simdjson v3.1.8
tlx v0.6.1
nlohmann v3.11.3
zsv commit-id:5c22aae4363fdcd433079d2a9b48007a7c6fbbdf
zsv commit-id:9db4bd2f99dff430485710d303b7435e269e1edc # 12/24/2024
mlas onnxruntime:2c53b4a534a9b64466e435d384c91f0b684ea58a
cppjieba v5.1.0
thrift v0.19.0
Expand Down
4 changes: 4 additions & 0 deletions third_party/zsv/include/zsv.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@
#ifndef ZSV_H
#define ZSV_H

//#ifndef ZSV_EXTRAS
//#define ZSV_EXTRAS
//#endif

#include <stddef.h>
#include "zsv/common.h"
#include "zsv/api.h"
Expand Down
49 changes: 21 additions & 28 deletions third_party/zsv/include/zsv/api.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
#define ZSV_ROW_MAX_SIZE_MIN_S "1024"

#define ZSV_MIN_SCANNER_BUFFSIZE 4096
#define ZSV_DEFAULT_SCANNER_BUFFSIZE (1<<18) // 256k
#define ZSV_DEFAULT_SCANNER_BUFFSIZE (1 << 18) // 256k

#include "zsv_export.h"
/*****************************************************************************
Expand All @@ -41,6 +41,9 @@
* - zsv_delete(): dispose the parser
******************************************************************************/

/**
 * NOTE(review): this function is declared here without documentation and its
 * definition is not visible from this header. The name suggests that it
 * inspects upcoming input without consuming it -- TODO confirm against the
 * implementation, including the meaning of the `int` return value.
 *
 * @param parser handle (the parameter is unnamed in the prototype)
 */
ZSV_EXPORT
int zsv_peek(zsv_parser);

/**
* Create a zsv parser. Typically, passed options will at least include a
* `row_handler()` callback. Many, but not all, options can be subsequently
Expand Down Expand Up @@ -76,7 +79,6 @@ ZSV_EXPORT enum zsv_status zsv_finish(zsv_parser);
*/
ZSV_EXPORT enum zsv_status zsv_delete(zsv_parser);


/******************************************************************************
* minimal access functions:
* - zsv_cell_count(): get the number of cells in the row
Expand Down Expand Up @@ -151,8 +153,7 @@ const char *zsv_lib_version(void);
* @param parser
* @param row_handler new callback value
*/
ZSV_EXPORT void zsv_set_row_handler(zsv_parser,
void (*row_handler)(void *ctx));
ZSV_EXPORT void zsv_set_row_handler(zsv_parser, void (*row_handler)(void *ctx));

/**
* Check if the row we just parsed consisted entirely of blank data
Expand Down Expand Up @@ -180,8 +181,7 @@ void zsv_set_context(zsv_parser parser, void *ctx);
* @param stream value that is passed to read_function when it is called
*/
ZSV_EXPORT
void zsv_set_read(zsv_parser parser,
size_t (*read_func)(void * __restrict, size_t n, size_t size, void * __restrict));
void zsv_set_read(zsv_parser parser, size_t (*read_func)(void *restrict, size_t n, size_t size, void * __restrict__));

/**
* Set the input stream our parser reads from. If not explicitly set, defaults to
Expand All @@ -206,9 +206,7 @@ void zsv_set_input(zsv_parser, void *in);
* does not exceed the bufflen it was passed
*/
ZSV_EXPORT enum zsv_status zsv_set_scan_filter(zsv_parser parser,
size_t (*filter)(void *ctx,
unsigned char *buff,
size_t bufflen),
size_t (*filter)(void *ctx, unsigned char *buff, size_t bufflen),
void *ctx);

/**
Expand All @@ -230,9 +228,7 @@ ZSV_EXPORT enum zsv_status zsv_set_fixed_offsets(zsv_parser parser, size_t count
* the parser buffer!
* @param len length of the input to parse
*/
ZSV_EXPORT enum zsv_status zsv_parse_bytes(zsv_parser parser,
const unsigned char * __restrict buff,
size_t len);
ZSV_EXPORT enum zsv_status zsv_parse_bytes(zsv_parser parser, const unsigned char * __restrict__ buff, size_t len);

/**
* Get a text description of a status code
Expand All @@ -258,6 +254,11 @@ ZSV_EXPORT size_t zsv_scanned_length(zsv_parser);
*/
ZSV_EXPORT size_t zsv_cum_scanned_length(zsv_parser parser);

/**
 * Get the raw byte length of this row
 *
 * @param parser parser handle
 * @return number of raw bytes scanned from the beginning to the end of this row
 */
ZSV_EXPORT size_t zsv_row_length_raw_bytes(zsv_parser parser);

/**
* Check the quoted status of the last cell that was read. This function is only
* applicable when called from within a cell_handler() callback. Furthermore, this
Expand All @@ -280,23 +281,16 @@ char zsv_quoted(zsv_parser parser);
* Each argument to `zsv_opts_new()` corresponds to the same-named `struct zsv_opts` element
* See common.h for details
*/
ZSV_EXPORT struct zsv_opts *
zsv_opts_new(
void (*row_handler)(void *ctx),
void (*cell_handler)(void *ctx, unsigned char *utf8_value, size_t len),
void *ctx,
zsv_generic_read read,
void *stream,
unsigned char *buff,
size_t buffsize,
unsigned max_columns,
unsigned max_row_size,
char delimiter,
char no_quotes
ZSV_EXPORT struct zsv_opts *zsv_opts_new(void (*row_handler)(void *ctx),
void (*cell_handler)(void *ctx, unsigned char *utf8_value, size_t len),
void *ctx, zsv_generic_read read, void *stream, unsigned char *buff,
size_t buffsize, unsigned max_columns, unsigned max_row_size, char delimiter,
char no_quotes
#ifdef ZSV_EXTRAS
, size_t max_rows
,
size_t max_rows
#endif
);
);

/**
* Destroy an option structure that was created by zsv_opts_new()
Expand All @@ -316,7 +310,6 @@ ZSV_EXPORT void zsv_opts_delete(struct zsv_opts *);
ZSV_EXPORT
enum zsv_status zsv_next_row(zsv_parser parser);


/******************************************************************************
* Miscellaneous functions used by the parser that may have standalone utility
******************************************************************************/
Expand Down
104 changes: 87 additions & 17 deletions third_party/zsv/include/zsv/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,11 @@
#define ZSV_COMMON_H

#ifdef __cplusplus
# define ZSV_BEGIN_DECL extern "C" {
# define ZSV_END_DECL }
#define ZSV_BEGIN_DECL extern "C" {
#define ZSV_END_DECL }
#else
# define ZSV_BEGIN_DECL
# define ZSV_END_DECL /* empty */
#define ZSV_BEGIN_DECL
#define ZSV_END_DECL /* empty */
#endif

enum zsv_status {
Expand All @@ -27,14 +27,15 @@ enum zsv_status {
zsv_status_row,
zsv_status_done = 100
#ifdef ZSV_EXTRAS
,zsv_status_max_rows_read = 999
,
zsv_status_max_rows_read = 999
#endif
};

/**
* `zsv_parser` is the type of a zsv parser handle
*/
typedef struct zsv_scanner * zsv_parser;
typedef struct zsv_scanner *zsv_parser;

/**
* Structure returned by `zsv_get_cell()` for fetching a parsed CSV cell value
Expand All @@ -53,11 +54,12 @@ struct zsv_cell {
/**
* bitfield values for `quoted` flags
*/
# define ZSV_PARSER_QUOTE_UNCLOSED 1 /* only used internally by parser */
# define ZSV_PARSER_QUOTE_CLOSED 2 /* value was quoted */
# define ZSV_PARSER_QUOTE_NEEDED 4 /* value contains delimiter or dbl-quote */
# define ZSV_PARSER_QUOTE_EMBEDDED 8 /* value contains dbl-quote */
# define ZSV_PARSER_QUOTE_PENDING 16 /* only used internally by parser */
#define ZSV_PARSER_QUOTE_NONE 0 /* content does not need to be quoted */
#define ZSV_PARSER_QUOTE_UNCLOSED 1 /* only used internally by parser */
#define ZSV_PARSER_QUOTE_CLOSED 2 /* value was quoted */
#define ZSV_PARSER_QUOTE_NEEDED 4 /* value contains delimiter or dbl-quote */
#define ZSV_PARSER_QUOTE_EMBEDDED 8 /* value contains dbl-quote */
#define ZSV_PARSER_QUOTE_PENDING 16 /* only used internally by parser */
/**
* quoted flags enable additional efficiency, in particular when input data will
* be output as text (csv, json etc), by indicating whether the cell contents may
Expand All @@ -66,12 +68,14 @@ struct zsv_cell {
* quoting or escaping will be required
*/
char quoted;
unsigned char overwritten : 1;
};

typedef size_t (*zsv_generic_write)(const void * __restrict, size_t, size_t, void * __restrict);
typedef size_t (*zsv_generic_read)(void * __restrict, size_t n, size_t size, void * __restrict);
/*
 * Function-pointer types for caller-supplied I/O callbacks.
 */

/* Write callback: takes a const source buffer, two size_t values and an
 * opaque destination pointer; returns a size_t.
 * NOTE(review): the shape resembles fwrite() -- confirm the expected meaning
 * of the two size_t arguments and of the return value. */
typedef size_t (*zsv_generic_write)(const void *restrict, size_t, size_t, void *);

/* Read callback: takes a destination buffer, `n`, `size` and an opaque
 * pointer; returns a size_t.
 * NOTE(review): presumably the last argument is the input stream configured
 * for the parser -- confirm against the implementation. */
typedef size_t (*zsv_generic_read)(void *restrict, size_t n, size_t size, void * __restrict__);

/* Seek callback: takes an opaque pointer, a long and an int; returns an int.
 * NOTE(review): presumably (stream, offset, whence) with fseek()-style
 * semantics -- confirm against the implementation. */
typedef int (*zsv_generic_seek)(void *, long, int);

# ifdef ZSV_EXTRAS
#ifdef ZSV_EXTRAS
/**
* progress callback function signature
* @param context pointer set in parser opts.progress.ctx
Expand All @@ -86,7 +90,31 @@ typedef int (*zsv_progress_callback)(void *ctx, size_t cumulative_row_count);
* @param code exit code
*/
typedef void (*zsv_completed_callback)(void *ctx, int code);
# endif

/**
 * Data can be "overwritten" on-the-fly by providing custom callbacks.
 * The calling code passes each overwrite to the zsv library via the
 * `zsv_overwrite_data` structure.
 */
struct zsv_overwrite_data {
size_t row_ix; // row index, 0-based
size_t col_ix; // column index, 0-based
size_t timestamp; // NOTE(review): units / epoch are not specified in this header
struct zsv_cell val; // presumably the replacement cell value -- TODO confirm
struct zsv_cell author; // presumably identifies who made the overwrite -- TODO confirm
struct zsv_cell old_value; // presumably the value being replaced -- TODO confirm
char have; // 1 = we have unprocessed overwrites
};

/**
 * Caller-supplied callbacks and state for providing overwrite data.
 * Each callback takes a `void *ctx` argument and returns an `enum zsv_status`.
 */
struct zsv_opt_overwrite {
void *ctx; // opaque caller state; presumably passed to each callback below -- TODO confirm
enum zsv_status (*open)(void *ctx); // NOTE(review): call timing not documented here; presumably invoked before the first next()
enum zsv_status (*next)(void *ctx, struct zsv_overwrite_data *odata); // receives a zsv_overwrite_data pointer; presumably fills it with the next overwrite
enum zsv_status (*close)(void *ctx); // NOTE(review): presumably invoked once overwrites are no longer needed
char cancel; // explicitly cancel application of overwrites
};

#endif

struct zsv_opts {
/**
Expand Down Expand Up @@ -131,6 +159,12 @@ struct zsv_opts {
*/
zsv_generic_read read;

/**
* Caller can specify its own seek function for setting the file position
* with zsv_index_seek. If not specified, the default value is `fseek()`
*/
zsv_generic_seek seek;

/**
* Caller can specify its own stream that is passed to the read function
* If not specified, the default value is stdin
Expand Down Expand Up @@ -230,7 +264,27 @@ struct zsv_opts {
#define ZSV_MALFORMED_UTF8_REMOVE -1
char malformed_utf8_replace;

# ifdef ZSV_EXTRAS
/**
 * `option_overrides` is a bitfield that indicates what ZSV options, if any,
 * were specifically set in the command invocation. It is used to ensure
 * that option values set in the command invocation take priority over
 * default values, or values saved in related property values such as
 * .zsv/data/<filename>/props.json
 *
 * For example, if a file has a saved header row span of 2, but the
 * command-line arguments explicitly included `--header-row-span 3`,
 * then setting header_span to 3 and setting
 * option_overrides.header_row_span ensures that the value of 3 is used
 */
struct {
unsigned char header_row_span : 1;
unsigned char skip_head : 1;
unsigned char max_column_count : 1;
unsigned char malformed_utf8_replacement : 1;
unsigned char _ : 4; // unnamed-purpose filler: the four flags above plus these 4 bits total 8 bits
} option_overrides;

#ifdef ZSV_EXTRAS
struct {
/**
* min number of rows between progress callback calls
Expand Down Expand Up @@ -269,7 +323,23 @@ struct zsv_opts {
*/
size_t max_rows;

# endif
/**
* If non-zero, automatically apply overwrites located in
* /path/to/.zsv/data/my-data.csv/overwrite.sqlite3 for a given
* input /path/to/my-data.csv
*
* This flag is only used by zsv_new_with_properties()
* if using zsv_new(), this flag is ignored (use the `overwrite` structure instead)
*/
char overwrite_auto;

/**
* Optional cell-level values that overwrite data returned to the caller by the API
* Use when not using overwrite_auto together with zsv_new_with_properties()
*/
struct zsv_opt_overwrite overwrite;

#endif /* ZSV_EXTRAS */
};

#endif
Loading

0 comments on commit ede4d2d

Please sign in to comment.