From 9f8c8be6ad1badd598ab42dc95902ba94bd6bb22 Mon Sep 17 00:00:00 2001 From: Github Action Date: Wed, 3 Jul 2024 07:05:21 +0000 Subject: [PATCH] doc update for tag `python-v0.18.2` --- 404.html | 83 +- api/catalog/index.html | 83 +- .../delta_table_alterer/index.html | 123 +- api/delta_table/delta_table_merger/index.html | 83 +- .../delta_table_optimizer/index.html | 83 +- api/delta_table/index.html | 143 +- api/delta_table/metadata/index.html | 83 +- api/delta_writer/index.html | 105 +- api/exceptions/index.html | 89 +- api/schema/index.html | 83 +- api/storage/index.html | 83 +- delta-lake-best-practices/index.html | 83 +- delta-lake-big-data-small-data/index.html | 83 +- .../architecture-of-delta-table/index.html | 85 +- .../delta-lake-acid-transactions/index.html | 83 +- .../delta-lake-file-skipping/index.html | 83 +- index.html | 83 +- integrations/delta-lake-arrow/index.html | 91 +- integrations/delta-lake-daft/index.html | 446 ++-- integrations/delta-lake-dagster/index.html | 85 +- integrations/delta-lake-dask/index.html | 109 +- integrations/delta-lake-datafusion/index.html | 85 +- integrations/delta-lake-pandas/index.html | 85 +- integrations/delta-lake-polars/index.html | 85 +- integrations/object-storage/hdfs/index.html | 1982 +++++++++++++++++ objects.inv | Bin 1795 -> 1818 bytes search/search_index.json | 2 +- sitemap.xml | 83 +- sitemap.xml.gz | Bin 606 -> 619 bytes .../index.html | 83 +- usage/constraints/index.html | 83 +- usage/create-delta-lake-table/index.html | 83 +- .../index.html | 85 +- usage/examining-table/index.html | 83 +- usage/installation/index.html | 87 +- usage/loading-table/index.html | 84 +- usage/managing-tables/index.html | 83 +- usage/optimize/delta-lake-z-order/index.html | 83 +- .../index.html | 83 +- usage/overview/index.html | 83 +- usage/querying-delta-tables/index.html | 83 +- usage/read-cdf/index.html | 83 +- usage/writing/index.html | 83 +- .../index.html | 150 +- why-use-delta-lake/index.html | 85 +- 45 files changed, 5622 insertions(+), 376 deletions(-) create mode 100644 integrations/object-storage/hdfs/index.html diff --git a/404.html b/404.html index 5c6f9d488b..6661ace309 100644 --- a/404.html +++ b/404.html @@ -274,8 +274,14 @@ + + + + + +
  • diff --git a/api/catalog/index.html b/api/catalog/index.html index a7e3f18f0f..f5f0d2b425 100644 --- a/api/catalog/index.html +++ b/api/catalog/index.html @@ -287,8 +287,14 @@ + + + + + +
  • diff --git a/api/delta_table/delta_table_alterer/index.html b/api/delta_table/delta_table_alterer/index.html index a5a5ee2265..90a331151d 100644 --- a/api/delta_table/delta_table_alterer/index.html +++ b/api/delta_table/delta_table_alterer/index.html @@ -287,8 +287,14 @@ + + + + + +
  • - + @@ -300,6 +306,9 @@ + + + @@ -1241,6 +1250,13 @@ drop_constraint +
  • + +
  • + + set_table_properties + +
  • @@ -1371,6 +1387,8 @@ + + @@ -1414,6 +1432,76 @@ + + + + + + + + + +
  • + + + + + + + + + + + + + +
  • + + + + + + + + +
  • @@ -1718,6 +1806,13 @@ drop_constraint +
  • + +
  • + + set_table_properties + +
  • @@ -1951,6 +2046,32 @@

    +
    + + + +

    + set_table_properties + + +

    +
    set_table_properties(properties: Dict[str, str], raise_if_not_exists: bool = True, custom_metadata: Optional[Dict[str, str]] = None) -> None
    +
    + +
    + +

Set properties on the table.
+Args:
+    properties: properties to set on the table.
+    raise_if_not_exists: whether to raise an error if a property key does not exist.
+    custom_metadata: custom metadata that will be added to the transaction commit.
+Example:
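A minimal illustrative sketch (the table path below is a placeholder):

```python
from deltalake import DeltaTable

# Open an existing table (placeholder path) and set a table property.
dt = DeltaTable("tmp/some-table")
dt.alter.set_table_properties({"delta.logRetentionDuration": "interval 7 days"})
```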

    + +
    + +
    + + diff --git a/api/delta_table/delta_table_merger/index.html b/api/delta_table/delta_table_merger/index.html index cb3484ffe0..8d03bc62b8 100644 --- a/api/delta_table/delta_table_merger/index.html +++ b/api/delta_table/delta_table_merger/index.html @@ -287,8 +287,14 @@ + + + + + +
  • diff --git a/api/delta_table/delta_table_optimizer/index.html b/api/delta_table/delta_table_optimizer/index.html index 025bcee794..b4a1490f76 100644 --- a/api/delta_table/delta_table_optimizer/index.html +++ b/api/delta_table/delta_table_optimizer/index.html @@ -287,8 +287,14 @@ + + + + + +
  • diff --git a/api/delta_table/index.html b/api/delta_table/index.html index dafc57aed6..b1e7c00321 100644 --- a/api/delta_table/index.html +++ b/api/delta_table/index.html @@ -287,8 +287,14 @@ + + + + + +
  • @@ -1686,6 +1767,13 @@ files +
  • + +
  • + + files_by_partitions + +
  • @@ -2107,7 +2195,7 @@

    -
    create(table_uri: Union[str, Path], schema: Union[pyarrow.Schema, DeltaSchema], mode: Literal['error', 'append', 'overwrite', 'ignore'] = 'error', partition_by: Optional[Union[List[str], str]] = None, name: Optional[str] = None, description: Optional[str] = None, configuration: Optional[Mapping[str, Optional[str]]] = None, storage_options: Optional[Dict[str, str]] = None, custom_metadata: Optional[Dict[str, str]] = None) -> DeltaTable
    +
    create(table_uri: Union[str, Path], schema: Union[pyarrow.Schema, DeltaSchema], mode: Literal['error', 'append', 'overwrite', 'ignore'] = 'error', partition_by: Optional[Union[List[str], str]] = None, name: Optional[str] = None, description: Optional[str] = None, configuration: Optional[Mapping[str, Optional[str]]] = None, storage_options: Optional[Dict[str, str]] = None, custom_metadata: Optional[Dict[str, str]] = None, raise_if_key_not_exists: bool = True) -> DeltaTable
     
    @@ -2235,7 +2323,7 @@

    -

    options passed to the object store crate.

    +

    Options passed to the object store crate.

    @@ -2249,13 +2337,27 @@

    -

    custom metadata that will be added to the transaction commit.

    +

    Custom metadata that will be added to the transaction commit.

    None + + raise_if_key_not_exists + + bool + + +
    +

    Whether to raise an error if the configuration uses keys that are not Delta keys

    +
    + + + True + + @@ -2604,6 +2706,27 @@

    +

    + files_by_partitions + + +

    +
    files_by_partitions(partition_filters: Optional[FilterType]) -> List[str]
    +
    + +
    + +

    Get the files for each partition

    + +
    + +
    + + +
    + + +

    from_data_catalog @@ -3610,7 +3733,7 @@

    -
    to_pandas(partitions: Optional[List[Tuple[str, str, Any]]] = None, columns: Optional[List[str]] = None, filesystem: Optional[Union[str, pa_fs.FileSystem]] = None, filters: Optional[FilterType] = None) -> pd.DataFrame
    +
    to_pandas(partitions: Optional[List[Tuple[str, str, Any]]] = None, columns: Optional[List[str]] = None, filesystem: Optional[Union[str, pa_fs.FileSystem]] = None, filters: Optional[Union[FilterType, Expression]] = None) -> pd.DataFrame
     
    @@ -3675,11 +3798,11 @@

    filters - Optional[FilterType] + Optional[Union[FilterType, Expression]]
    -

    A disjunctive normal form (DNF) predicate for filtering rows. If you pass a filter you do not need to pass partitions

    +

    A disjunctive normal form (DNF) predicate for filtering rows, or directly a pyarrow.dataset.Expression. If you pass a filter you do not need to pass partitions

    @@ -3855,7 +3978,7 @@

    -
    to_pyarrow_table(partitions: Optional[List[Tuple[str, str, Any]]] = None, columns: Optional[List[str]] = None, filesystem: Optional[Union[str, pa_fs.FileSystem]] = None, filters: Optional[FilterType] = None) -> pyarrow.Table
    +
    to_pyarrow_table(partitions: Optional[List[Tuple[str, str, Any]]] = None, columns: Optional[List[str]] = None, filesystem: Optional[Union[str, pa_fs.FileSystem]] = None, filters: Optional[Union[FilterType, Expression]] = None) -> pyarrow.Table
     
    @@ -3920,11 +4043,11 @@

    filters - Optional[FilterType] + Optional[Union[FilterType, Expression]]
    -

    A disjunctive normal form (DNF) predicate for filtering rows. If you pass a filter you do not need to pass partitions

    +

    A disjunctive normal form (DNF) predicate for filtering rows, or directly a pyarrow.dataset.Expression. If you pass a filter you do not need to pass partitions

    diff --git a/api/delta_table/metadata/index.html b/api/delta_table/metadata/index.html index 82a4346015..e7ce141ecb 100644 --- a/api/delta_table/metadata/index.html +++ b/api/delta_table/metadata/index.html @@ -287,8 +287,14 @@ + + + + + +
  • diff --git a/api/delta_writer/index.html b/api/delta_writer/index.html index 3792c432bf..27ff6a6993 100644 --- a/api/delta_writer/index.html +++ b/api/delta_writer/index.html @@ -287,8 +287,14 @@ + + + + + +
  • @@ -1797,7 +1878,7 @@

    -
    write_deltalake(table_or_uri: Union[str, Path, DeltaTable], data: Union[pd.DataFrame, ds.Dataset, pa.Table, pa.RecordBatch, Iterable[pa.RecordBatch], RecordBatchReader], *, schema: Optional[Union[pa.Schema, DeltaSchema]] = None, partition_by: Optional[Union[List[str], str]] = None, mode: Literal['error', 'append', 'overwrite', 'ignore'] = 'error', file_options: Optional[ds.ParquetFileWriteOptions] = None, max_partitions: Optional[int] = None, max_open_files: int = 1024, max_rows_per_file: int = 10 * 1024 * 1024, min_rows_per_group: int = 64 * 1024, max_rows_per_group: int = 128 * 1024, name: Optional[str] = None, description: Optional[str] = None, configuration: Optional[Mapping[str, Optional[str]]] = None, overwrite_schema: bool = False, schema_mode: Optional[Literal['merge', 'overwrite']] = None, storage_options: Optional[Dict[str, str]] = None, partition_filters: Optional[List[Tuple[str, str, Any]]] = None, predicate: Optional[str] = None, large_dtypes: bool = False, engine: Literal['pyarrow', 'rust'] = 'pyarrow', writer_properties: Optional[WriterProperties] = None, custom_metadata: Optional[Dict[str, str]] = None) -> None
    +
    write_deltalake(table_or_uri: Union[str, Path, DeltaTable], data: Union[pd.DataFrame, ds.Dataset, pa.Table, pa.RecordBatch, Iterable[pa.RecordBatch], RecordBatchReader], *, schema: Optional[Union[pa.Schema, DeltaSchema]] = None, partition_by: Optional[Union[List[str], str]] = None, mode: Literal['error', 'append', 'overwrite', 'ignore'] = 'error', file_options: Optional[ds.ParquetFileWriteOptions] = None, max_partitions: Optional[int] = None, max_open_files: int = 1024, max_rows_per_file: int = 10 * 1024 * 1024, min_rows_per_group: int = 64 * 1024, max_rows_per_group: int = 128 * 1024, name: Optional[str] = None, description: Optional[str] = None, configuration: Optional[Mapping[str, Optional[str]]] = None, schema_mode: Optional[Literal['merge', 'overwrite']] = None, storage_options: Optional[Dict[str, str]] = None, partition_filters: Optional[List[Tuple[str, str, Any]]] = None, predicate: Optional[str] = None, large_dtypes: bool = False, engine: Literal['pyarrow', 'rust'] = 'pyarrow', writer_properties: Optional[WriterProperties] = None, custom_metadata: Optional[Dict[str, str]] = None) -> None
     
    @@ -1807,9 +1888,9 @@

The pyarrow writer currently supports protocol version 2 and won't be updated. For higher protocol support, use engine='rust'; this will eventually become the default.

    -

    A locking mechanism is needed to prevent unsafe concurrent writes to a -delta lake directory when writing to S3. For more information on the setup, follow -this usage guide: https://delta-io.github.io/delta-rs/usage/writing/writing-to-s3-with-locking-provider/

    +

    To enable safe concurrent writes when writing to S3, an additional locking +mechanism must be supplied. For more information on enabling concurrent writing to S3, follow +this guide
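As a rough sketch of the basic call (the table path and DataFrame below are made up):

```python
import pandas as pd
from deltalake import write_deltalake

df = pd.DataFrame({"id": [1, 2, 3], "country": ["DE", "CN", "AR"]})

# Create the table on the first write, then append on a subsequent write.
write_deltalake("tmp/example-table", df)
write_deltalake("tmp/example-table", df, mode="append")
```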

    @@ -2025,20 +2106,6 @@

    None - - overwrite_schema - - bool - - -
    -

    Deprecated, use schema_mode instead.

    -
    - - - False - - schema_mode diff --git a/api/exceptions/index.html b/api/exceptions/index.html index ad596ff6a4..88e3217821 100644 --- a/api/exceptions/index.html +++ b/api/exceptions/index.html @@ -14,7 +14,7 @@ - + @@ -287,8 +287,14 @@ + + + + + +
  • @@ -1947,13 +2028,13 @@

    - +

  • -

    The tranasction log file contains the following information:

    +

    The transaction log file contains the following information:

  • +

    Write to Delta Lake

    +

    You can use write_deltalake to write a Daft DataFrame to a Delta table:

    +
    df.write_deltalake("tmp/daft-table", mode="overwrite")
    +
    +

    Daft supports multiple write modes. See the Daft documentation for more information.

    What can I do with a Daft DataFrame?

    Daft gives you full-featured DataFrame functionality, similar to what you might be used to from pandas, Dask or PySpark.

    On top of this, Daft also gives you:

    @@ -1873,211 +1980,84 @@

    What can I do with a Daft DataFrame
  • Expressions API for easy column transformations
  • UDFs for multi-column transformation, incl. ML applications
  • -

    Let's take a quick look at some of Daft's basic DataFrame operations.

    -

    You can select columns from your DataFrame using the select method. We'll use the show method to show the first n rows (defaults to 10):

    -
    > df.select("first_name", "country").show()
    -
    -|    | first_name   | country   |
    -|---:|:-------------|:----------|
    -|  0 | Ernesto      | Argentina |
    -|  1 | Bruce        | China     |
    -|  2 | Jack         | China     |
    -|  3 | Wolfgang     | Germany   |
    -|  4 | Soraya       | Germany   |
    -
    -

    You can sort your Daft DataFrame using the sort method:

    -
    > df.sort(df["country"], desc=True).show()
    -
    -|    | first_name   | last_name   | country   | continent   |
    -|---:|:-------------|:------------|:----------|:------------|
    -|  0 | Wolfgang     | Manche      | Germany   | NaN         |
    -|  1 | Soraya       | Jala        | Germany   | NaN         |
    -|  2 | Bruce        | Lee         | China     | Asia        |
    -|  3 | Jack         | Ma          | China     | Asia        |
    -|  4 | Ernesto      | Guevara     | Argentina | NaN         |
    -
    -

    You can filter your DataFrame using the where method:

    -
    > df.where(df["continent"] == "Asia").show()
    -
    -|    | first_name   | last_name   | country   | continent   |
    -|---:|:-------------|:------------|:----------|:------------|
    -|  0 | Bruce        | Lee         | China     | Asia        |
    -|  1 | Jack         | Ma          | China     | Asia        |
    -
    -

    You can group your DataFrame by a specific columns using the groupby method. You can then specify the aggregation method, in this case using the count aggregator:

    -
    > df.select("first_name", "country").groupby(df["country"]).count("first_name").show()
    -
    -|    | country   |   first_name |
    -|---:|:----------|-------------:|
    -|  0 | Germany   |            2 |
    -|  1 | China     |            2 |
    -|  2 | Argentina |            1 |
    -

    Check out the Daft User Guide for a complete list of DataFrame operations.

    Data Skipping Optimizations

    -

    You may have noticed the Delta Lake warning at the top when we first called collect on our DataFrame:

    -
    WARNING: has partitioning keys = [PartitionField(country#Utf8)], but no partition filter was specified. This will result in a full table scan.
    - +

    Delta Lake and Daft work together to give you highly-optimized query performance.

    +

Delta Lake stores your data in Parquet files. Parquet is a columnar file format that natively supports column pruning. If your query only needs to read data from a specific column or set of columns, you don't need to read in the entire file. This can save you lots of time and compute.

    +

    Delta Lake goes beyond the basic Parquet features by also giving you:

    +
      +
    • partitioned reads
    • +
    • file skipping via z-ordering.
    • +
    +

    This is great for Daft users who want to run efficient queries on large-scale data.

    +

    Let's look at how this works.

    +

    Partitioned Reads

    +

    You may have noticed the Delta Lake warning at the top when we first called collect() on our DataFrame:

    +
    +

    WARNING: has partitioning keys = [PartitionField(country#Utf8)], but no partition filter was specified. This will result in a full table scan.

    +

    Delta Lake is informing us that the data is partitioned on the country column.

    -

    Daft's native query optimizer has access to all of the Delta Lake metadata.

    -

    This means it can optimize your query by skipping the partitions that are not relevant for this query. Instead of having to read all 3 partitions, we can read only 1 and get the same result, just faster!

    -
    # Filter on partition columns will result in efficient partition pruning; non-matching partitions will be skipped.
    -> df.where(df["country"] == "Germany").show()
    -
    -|    | first_name   | last_name   | country   |   continent |
    -|---:|:-------------|:------------|:----------|------------:|
    -|  0 | Wolfgang     | Manche      | Germany   |         nan |
    -|  1 | Soraya       | Jala        | Germany   |         nan |
    +

    Daft does some nice magic here to help you out. The Daft query optimizer has access to all of the Delta Lake metadata. This means it can optimize your query by skipping the partitions that are not relevant for this query. Instead of having to read all 3 partitions, we can read only 1 and get the same result, just faster!

    +
    # Filter on partition columns will result in efficient partition pruning; non-matching partitions will be skipped.
    +> df.where(df["country"] == "Germany").show()
    +
    +|    | first_name   | last_name   | country   |   continent |
    +|---:|:-------------|:------------|:----------|------------:|
    +|  0 | Wolfgang     | Manche      | Germany   |         nan |
    +|  1 | Soraya       | Jala        | Germany   |         nan |
     
    -

    You can use the explain method to see how Daft is optimizing your query. Since we've already called collect on our DataFrame, it is already in memory. So below we copy the output of explain(show_all=True) before calling collect:

    +

    You can use the explain() method to see how Daft is optimizing your query.

    +
    +

    Since we've already called collect on our DataFrame, it is already in memory. So below we copy the output of explain(show_all=True) before calling collect:

    +

    Running df.where(df["continent"] == "Asia").explain(True) returns:

    -
    (...)
    -
    -== Optimized Logical Plan ==
    -
    -* PythonScanOperator: DeltaLakeScanOperator(None)
    -|   File schema = first_name#Utf8, last_name#Utf8, country#Utf8, continent#Utf8
    -|   Partitioning keys = [PartitionField(country#Utf8)]
    -|   Filter pushdown = col(continent) == lit("Asia")
    -|   Output schema = first_name#Utf8, last_name#Utf8, country#Utf8, continent#Utf8
    -
    -
    -== Physical Plan ==
    -
    -* TabularScan:
    -|   Num Scan Tasks = 3
    -|   Estimated Scan Bytes = 3045
    -|   Clustering spec = { Num partitions = 3 }
    +
    (...)
    +
    +== Optimized Logical Plan ==
    +
    +* PythonScanOperator: DeltaLakeScanOperator(None)
    +|   File schema = first_name#Utf8, last_name#Utf8, country#Utf8, continent#Utf8
    +|   Partitioning keys = [PartitionField(country#Utf8)]
    +|   Filter pushdown = col(continent) == lit("Asia")
    +|   Output schema = first_name#Utf8, last_name#Utf8, country#Utf8, continent#Utf8
    +
    +
    +== Physical Plan ==
    +
    +* TabularScan:
    +|   Num Scan Tasks = 3
    +|   Estimated Scan Bytes = 3045
    +|   Clustering spec = { Num partitions = 3 }
     

    Whereas running df.where(df["country"] == "Germany").explain(True) returns:

    -
    (...)
    -
    -== Optimized Logical Plan ==
    -
    -* PythonScanOperator: DeltaLakeScanOperator(None)
    -|   File schema = first_name#Utf8, last_name#Utf8, country#Utf8, continent#Utf8
    -|   Partitioning keys = [PartitionField(country#Utf8)]
    -|   Partition Filter = col(country) == lit("Germany")
    -|   Output schema = first_name#Utf8, last_name#Utf8, country#Utf8, continent#Utf8
    -
    -
    -== Physical Plan ==
    -
    -* TabularScan:
    -|   Num Scan Tasks = 1
    -|   Estimated Scan Bytes = 1025
    -|   Clustering spec = { Num partitions = 1 }
    -
    -

    Running a query on a non-partitioned column like continent will require reading in all partitions, totalling 3045 bytes in this case.

    -

    Instead, running a query on a partitioned column (country in this case) means Daft only has to read only the relevant partition, saving us a whopping 2000+ bytes in this toy example :)

    -

    You can read High-Performance Querying on Massive Delta Lake Tables with Daft for an in-depth benchmarking of query optimization with Delta Lake and Daft.

    -

    Transform columns with Expressions

    -

    Daft provides a flexible Expressions API for defining computation that needs to happen over your columns.

    -

    For example, we can use daft.col() expressions together with the with_column method to create a new column full_name, joining the contents of the last_name column to the first_name column:

    -
    > df_full = df.with_column("full_name", daft.col('first_name') + ' ' + daft.col('last_name'))
    -> df_full.show()
    -
    -|    | first_name   | last_name   | country   | continent   | full_name       |
    -|---:|:-------------|:------------|:----------|:------------|:----------------|
    -|  0 | Ernesto      | Guevara     | Argentina | NaN         | Ernesto Guevara |
    -|  1 | Bruce        | Lee         | China     | Asia        | Bruce Lee       |
    -|  2 | Jack         | Ma          | China     | Asia        | Jack Ma         |
    -|  3 | Wolfgang     | Manche      | Germany   | NaN         | Wolfgang Manche |
    -|  4 | Soraya       | Jala        | Germany   | NaN         | Soraya Jala     |
    +
    (...)
    +
    +== Optimized Logical Plan ==
    +
    +* PythonScanOperator: DeltaLakeScanOperator(None)
    +|   File schema = first_name#Utf8, last_name#Utf8, country#Utf8, continent#Utf8
    +|   Partitioning keys = [PartitionField(country#Utf8)]
    +|   Partition Filter = col(country) == lit("Germany")
    +|   Output schema = first_name#Utf8, last_name#Utf8, country#Utf8, continent#Utf8
    +
    +
    +== Physical Plan ==
    +
    +* TabularScan:
    +|   Num Scan Tasks = 1
    +|   Estimated Scan Bytes = 1025
    +|   Clustering spec = { Num partitions = 1 }
     
    -

    Multimodal Data Type Support

    +

    Running a query on a non-partitioned column like continent will require reading in all partitions, totalling 3045 bytes in the case of this toy example.

    +

Instead, running a query on a partitioned column (country in this case) means Daft only has to read the relevant partition, saving roughly 60% of the compute in this toy example. This has a huge impact when you're working at scale.

    +

    Z-Ordering for enhanced file skipping

    +

    Z-ordering stores similar data close together to optimize query performance. This is especially useful when you're querying on one or multiple columns.

    +

    Using Z-Ordered Delta tables instead of regular Parquet can give Daft users significant speed-ups.
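As an illustrative sketch (reusing the hypothetical tmp/daft-table path from above), you can Z Order an existing Delta table with the deltalake Python API:

```python
from deltalake import DeltaTable

dt = DeltaTable("tmp/daft-table")

# Rewrite the files so that rows with similar `continent` values are
# clustered together, improving file skipping for queries on that column.
dt.optimize.z_order(["continent"])
```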

    +

    Read High-Performance Querying on Massive Delta Lake Tables with Daft for an in-depth benchmarking of query optimization with Delta Lake and Daft using partitioning and Z-ordering.

    +

    Daft gives you Multimodal Data Type Support

    Daft has a rich multimodal type-system with support for Python objects, Images, URLs, Tensors and more.

    -

    Daft columns can contain any Python objects. For example, let's add a column containing a Python class Dog for some of the people in our dataset:

    -
    > import numpy as np
    -
    -> class Dog:
    ->     def __init__(self, name):
    ->         self.name = name
    -
    ->     def bark(self):
    ->         return f"{self.name}!"
    -
    -> df_dogs = daft.from_pydict({
    ->     'full_name': ['Ernesto Guevara','Bruce Lee','Jack Ma','Wolfgang Manche','Soraya Jala'],
    ->     "dogs": [Dog("ruffles"), Dog("shnoodles"), Dog("waffles"), Dog("doofus"), Dog("Fluffles")],
    -> })
    -
    -> df_dogs.show()
    -
    -|    | full_name       | dogs                                 |
    -|---:|:----------------|:-------------------------------------|
    -|  0 | Ernesto Guevara | <__main__.Dog object at 0x1603d1c10> |
    -|  1 | Bruce Lee       | <__main__.Dog object at 0x126ab9b90> |
    -|  2 | Jack Ma         | <__main__.Dog object at 0x1603d27d0> |
    -|  3 | Wolfgang Manche | <__main__.Dog object at 0x1603d1cd0> |
    -|  4 | Soraya Jala     | <__main__.Dog object at 0x1603d3f50> |
    -
    -

    You can join this new dogs column to your existing DataFrame using the join method:

    -
    > df_family = df_full.join(df_dogs, on=["full_name"])
    -> df_family.show()
    -
    -|    | full_name       | first_name   | last_name   | country   | continent   | dogs                                 |
    -|---:|:----------------|:-------------|:------------|:----------|:------------|:-------------------------------------|
    -|  0 | Ernesto Guevara | Ernesto      | Guevara     | Argentina | NaN         | <__main__.Dog object at 0x1603d1c10> |
    -|  1 | Bruce Lee       | Bruce        | Lee         | China     | Asia        | <__main__.Dog object at 0x126ab9b90> |
    -|  2 | Jack Ma         | Jack         | Ma          | China     | Asia        | <__main__.Dog object at 0x1603d27d0> |
    -|  3 | Wolfgang Manche | Wolfgang     | Manche      | Germany   | NaN         | <__main__.Dog object at 0x1603d1cd0> |
    -|  4 | Soraya Jala     | Soraya       | Jala        | Germany   | NaN         | <__main__.Dog object at 0x1603d3f50> |
    -
    -

    We can then use the apply method to apply a function to each instance of the Dog class:

    -
    > from daft import DataType
    -
    -> df_family = df_family.with_column(
    ->     "dogs_bark_name",
    ->     df_family["dogs"].apply(lambda dog: dog.bark(), return_dtype=DataType.string()),
    -> )
    -
    -> df_family.show()
    -
    -|    | first_name   | last_name   | country   | continent   | full_name       | dogs                                 | dogs_bark_name   |
    -|---:|:-------------|:------------|:----------|:------------|:----------------|:-------------------------------------|:-----------------|
    -|  0 | Ernesto      | Guevara     | Argentina | NaN         | Ernesto Guevara | <__main__.Dog object at 0x1603d1c10> | ruffles!         |
    -|  1 | Bruce        | Lee         | China     | Asia        | Bruce Lee       | <__main__.Dog object at 0x126ab9b90> | shnoodles!       |
    -|  2 | Jack         | Ma          | China     | Asia        | Jack Ma         | <__main__.Dog object at 0x1603d27d0> | waffles!         |
    -|  3 | Wolfgang     | Manche      | Germany   | NaN         | Wolfgang Manche | <__main__.Dog object at 0x1603d1cd0> | doofus!          |
    -|  4 | Soraya       | Jala        | Germany   | NaN         | Soraya Jala     | <__main__.Dog object at 0x1603d3f50> | Fluffles!        |
    -
    -

    Daft DataFrames can also contain many other data types, like tensors, JSON, URLs and images. The Expressions API provides useful tools to work with these data types.

    -

    Take a look at the notebook in the delta-examples Github repository for a closer look at how Daft handles URLs, images and ML applications.

    -

    Transform multiple columns with UDFs

    -

    You can use User-Defined Functions (UDFs) to run functions over multiple rows or columns:

    -
    > from daft import udf
    -
    -> @udf(return_dtype=DataType.string())
    -> def custom_bark(dog_series, owner_series):
    ->     return [
    ->         f"{dog.name} loves {owner_name}!"
    ->         for dog, owner_name
    ->         in zip(dog_series.to_pylist(), owner_series.to_pylist())
    ->     ]
    -
    -> df_family = df_family.with_column("custom_bark", custom_bark(df_family["dogs"], df_family["first_name"]))
    -> df_family.select("full_name", "dogs_bark_name", "custom_bark").show()
    -
    -|    | full_name       | dogs_bark_name   | custom_bark            |
    -|---:|:----------------|:-----------------|:-----------------------|
    -|  0 | Ernesto Guevara | ruffles!         | ruffles loves Ernesto! |
    -|  1 | Bruce Lee       | shnoodles!       | shnoodles loves Bruce! |
    -|  2 | Jack Ma         | waffles!         | waffles loves Jack!    |
    -|  3 | Wolfgang Manche | doofus!          | doofus loves Wolfgang! |
    -|  4 | Soraya Jala     | Fluffles!        | Fluffles loves Soraya! |
    -
    -

    Daft supports workloads with many more data types than traditional DataFrame APIs.

    -

    By combining multimodal data support with the UDF functionality you can run ML workloads right within your DataFrame.

    -

    When should I use Daft DataFrames?

    -

    Daft DataFrames are designed for multimodal, distributed workloads.

    -

    You may want to consider using Daft if you're working with:

    -
      -
    1. Large datasets that don't fit into memory or would benefit from parallelization
    2. -
    3. Multimodal data types, such as images, JSON, vector embeddings, and tensors
    4. -
    5. ML workloads that would benefit from interactive computation within DataFrame (via UDFs)
    6. -
    -

    Take a look at the Daft tutorials for in-depth examples of each use case.

    +

    The Expressions API provides useful tools to work with these data types. By combining multimodal data support with the User-Defined Functions API you can run ML workloads right within your DataFrame.

    +

    Take a look at the notebook in the delta-examples Github repository for a closer look at how Daft handles URLs, images and ML applications.

    Contribute to daft

    Excited about Daft and want to contribute? Join them on Github 🚀

    Like many technologies, Daft collects some non-identifiable telemetry to improve the product. This is stricly non-identifiable metadata. You can disable telemetry by setting the following environment variable: DAFT_ANALYTICS_ENABLED=0. Read more in the Daft documentation.

    diff --git a/integrations/delta-lake-dagster/index.html b/integrations/delta-lake-dagster/index.html index 4bb6b04af3..90583f042c 100644 --- a/integrations/delta-lake-dagster/index.html +++ b/integrations/delta-lake-dagster/index.html @@ -287,8 +287,16 @@ + + + + + + + +
  • diff --git a/integrations/delta-lake-dask/index.html b/integrations/delta-lake-dask/index.html index 71e6cb2991..e2c87c1f2f 100644 --- a/integrations/delta-lake-dask/index.html +++ b/integrations/delta-lake-dask/index.html @@ -287,8 +287,16 @@ + + + + + + + +
  • @@ -1790,11 +1873,13 @@

    Using Delta Lake with Dask

    Delta Lake is a great storage format for Dask analyses. This page explains why and how to use Delta Lake with Dask.

    You will learn how to read Delta Lakes into Dask DataFrames, how to query Delta tables with Dask, and the unique advantages Delta Lake offers the Dask community.

    -

    Here are some of the benefits that Delta Lake provides Dask users: -- better performance with file skipping -- enhanced file skipping via Z Ordering -- ACID transactions for reliable writes -- easy time-travel functionality

    +

    Here are some of the benefits that Delta Lake provides Dask users:

    +
      +
    • better performance with file skipping
    • +
    • enhanced file skipping via Z Ordering
    • +
    • ACID transactions for reliable writes
    • +
    • easy time-travel functionality
    • +

    ❗️ dask-deltatable currently works with deltalake<=13.0. See https://github.com/dask-contrib/dask-deltatable/issues/65

    @@ -1803,7 +1888,7 @@

    Install Dask-Deltatable

    pip install dask-deltatable
     

    Reading Delta Tables into a Dask DataFrame

    -

    You can read data stored in a Delta Lake into a Dask DataFrame using dask-deltatable.read_deltalake.

    +

    You can read data stored in a Delta Lake into a Dask DataFrame using dask-deltatable.read_deltalake.

    Let's read in a toy dataset to see what we can do with Delta Lake and Dask. You can access the data stored as a Delta Lake on Github

    import dask_deltatable as ddt
     
    @@ -1863,7 +1948,7 @@ 

    What can I do with a Dask Deltatab | 1 | Jack | Ma | China | Asia |

    Perform Dask Operations

    -

    Let's perform some basic computations over the Delta Lake data that's now stored in our Dask DataFrame.

    +

    Let's perform some basic computations over the Delta Lake data that's now stored in our Dask DataFrame.

    Suppose you want to group the dataset by the country column:

    > ddf.groupby(['country']).count().compute()
     
    @@ -1873,9 +1958,9 @@ 

    Perform Dask Operations

    | China | 2 | 2 | 2 | | Germany | 2 | 2 | 2 |
    -

    Dask executes this groupby operation in parallel across all available cores.

    +

    Dask executes this groupby operation in parallel across all available cores.

    Map Functions over Partitions

    -

    You can also use Dask's map_partitions method to map a custom Python function over all the partitions.

    +

    You can also use Dask's map_partitions method to map a custom Python function over all the partitions.

    Let's write a function that will replace the missing continent values with the right continent names.

    # define custom python function
     
    @@ -1892,7 +1977,7 @@ 

    Map Functions over Partitions

    partition.loc[partition.country=="Germany"] = partition.loc[partition.country=="Germany"].replace(na_string, "Europe") else: pass - return partition + return partition

    Now map this over all partitions in the Dask DataFrame:

    # define metadata and map function over partitions
    @@ -1910,7 +1995,7 @@ 

    Map Functions over Partitions

    Write to Delta Lake

    After doing your data processing in Dask, you can write the data back out to Delta Lake using to_deltalake:

    -
    ddt.to_deltalake(ddf, "tmp/test_write")
    +
    ddt.to_deltalake("tmp/test_write", ddf)
     

    Contribute to dask-deltalake

    To contribute, go to the dask-deltalake Github repository.

    diff --git a/integrations/delta-lake-datafusion/index.html b/integrations/delta-lake-datafusion/index.html index 61557b4f5e..e2b8f7c224 100644 --- a/integrations/delta-lake-datafusion/index.html +++ b/integrations/delta-lake-datafusion/index.html @@ -287,8 +287,16 @@ + + + + + + + +
  • diff --git a/integrations/delta-lake-pandas/index.html b/integrations/delta-lake-pandas/index.html index 0c59b80d84..090d68d054 100644 --- a/integrations/delta-lake-pandas/index.html +++ b/integrations/delta-lake-pandas/index.html @@ -287,8 +287,16 @@ + + + + + + + +
  • diff --git a/integrations/delta-lake-polars/index.html b/integrations/delta-lake-polars/index.html index 432f00dd5a..bcfe52b333 100644 --- a/integrations/delta-lake-polars/index.html +++ b/integrations/delta-lake-polars/index.html @@ -287,8 +287,16 @@ + + + + + + + +
  • diff --git a/integrations/object-storage/hdfs/index.html b/integrations/object-storage/hdfs/index.html new file mode 100644 index 0000000000..69fbe067b3 --- /dev/null +++ b/integrations/object-storage/hdfs/index.html @@ -0,0 +1,1982 @@ + + + + + + + + + + + + + + + + + + + + + + + + + HDFS Storage Backend - Delta Lake Documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    HDFS Storage Backend

    +

    HDFS support is provided via the hdfs-native-object-store package, which sits on top of hdfs-native. This is an HDFS client written from scratch in Rust, with no bindings to libhdfs or any use of Java. While it supports most common cluster configurations, it does not support every possible client configuration that could exist.

    +

    Supported Configurations

    +

By default, the client looks for existing Hadoop configs in the following manner: +- If the HADOOP_CONF_DIR environment variable is defined, load configs from $HADOOP_CONF_DIR/core-site.xml and $HADOOP_CONF_DIR/hdfs-site.xml +- Otherwise, if the HADOOP_HOME environment variable is set, load configs from $HADOOP_HOME/etc/hadoop/core-site.xml and $HADOOP_HOME/etc/hadoop/hdfs-site.xml

    +

    Additionally, you can pass Hadoop configs as storage_options and these will take precedence over the above configs.

    +

    Currently the supported client configuration parameters are: +- dfs.ha.namenodes.* - name service support +- dfs.namenode.rpc-address.* - name service support +- fs.viewfs.mounttable.*.link.* - ViewFS links +- fs.viewfs.mounttable.*.linkFallback - ViewFS link fallback
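For example, here is a hedged sketch of overriding the name service configuration through storage_options (all host names and paths below are placeholders):

```python
from deltalake import DeltaTable

# Placeholder Hadoop settings; anything passed here takes precedence over
# values found in core-site.xml / hdfs-site.xml.
storage_options = {
    "dfs.ha.namenodes.mycluster": "nn1,nn2",
    "dfs.namenode.rpc-address.mycluster.nn1": "namenode1.example.com:8020",
    "dfs.namenode.rpc-address.mycluster.nn2": "namenode2.example.com:8020",
}

dt = DeltaTable("hdfs://mycluster/path/to/table", storage_options=storage_options)
```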

    +

    If you find your setup is not supported, please file an issue in the hdfs-native repository.

    +

    Secure Clusters

    +

The client supports connecting to secure clusters through both Kerberos authentication and token authentication, and all SASL protection types are supported. The highest supported protection mechanism advertised by the server will be used.

    +

    Kerberos Support

    +

    Kerberos is supported through dynamically loading the libgssapi_krb5 library. This must be installed separately through your package manager, and currently only works on Linux and Mac.

    +

    Debian-based systems: +

    apt-get install libgssapi-krb5-2
    +

    +

    RHEL-based systems: +

    yum install krb5-libs
    +

    +

    MacOS: +

    brew install krb5
    +

    +

    Then simply kinit to get your TGT and authentication to HDFS should just work.

    +

    Token Support

    +

Token authentication is supported by looking for a token file at the path given by the HADOOP_TOKEN_FILE_LOCATION environment variable. This is where systems like YARN automatically place a delegation token, so authentication works out of the box inside YARN jobs.

    +

    Issues

    +

    If you face any HDFS-specific issues, please report to the hdfs-native-object-store repository.

    + + + + + + +
    +
    + + + + +
    + +
    + + + +
    +
    +
    +
    + + + + + + + + + + \ No newline at end of file diff --git a/objects.inv b/objects.inv index aa64e828ffdf814d981618f8fbfba91860ad3430..ff71318ca2cd3cc7d8921bb1e0950c74233afa5e 100644 GIT binary patch delta 1653 zcmV-*28#KE4w?>-xPK*TI5U)cNsL{-&≀pBm+=aI%m;)b3OF*Mt0CsH^($dsfJ% zx%%x&*7`%6|4gvqH(LtwpJv%%zhZoeuiRn1v*jEET1A zTNKA2LP~u6FZ#Y|vefRg^vAyZ4_^g4A2jM`rur$qnOdAb4G_Xk{icX# zF_8l7yYV29;Bkovd(AdyYn?C@0}#Y9H`heTl#mR_ApTAh2$eR>mC#+I_suS(R?JtyN#tt$UB>(*`4!+#TA zgHR{ju`JKm zl>S&hbrZ$hp2QrLX&B~^j-PFb5Pxh4^jRLk9Omm~%2p`d3#84Gn~}&-1%kocXROZx z0$VP#q|ru$6vPLzsImA_3gW|A)>wzLI%>i=$-Dukd1vG{7zp<8c z251;c6vrAJwV(l%Nscv|GvbEHZg{Kp#B`G3B?h!zCb46k24w`EFty^Q)PF@5JQjZv zg7{#bJl5hQ1}y@a^jL#Y8o@6!?eY9U3mODc@v;17xgA{K!>RgMhqF3r0%Y@3+-4Z8 z?K1P9Vd~=BslYn|Sff#jz^}eTK(o5QfVP464X{?D6vij40PBZi)u=*wgr^2@WTL@AwL`*VSM@i*B8^JTSB)Pu|DV%a^Go(hg9AQaSZ2FhAyl#ZbB zUxl%`vfJ_3-;25qWUc`3Q%%u*adtw?@M%1?2CiUG9Z(5co)xlb zu710cwf>NIzX;d;^x>CZuin4QBch!woU)x%I%i4BtWdIQYf&mEbLnIUd2Fl9X;aZa zR!OJWa(_{+kU#1|iPq|76=8+ino@M_Co=y}VV>BeC*_39bAKUwDZfU~x^c!H*!sdq zUx`NSm2EohD$XX57ww+AHdaoT{75O4eOjYl*-cFCYp-7wtFzYDIFp%z?QT^qwOt|a zVv65-P4=uxe%m=CYFXv7Sw%QVYt2jFSsUfHK)K_<%>uP`zGmPYZt<&8t2tS9k<*>LQ!tPH)c)G(5Wf?toDu1Mg>!{V$URo_H*Px>@U{mzxEN;8UKi?=RXiNgUS~PVC>9qquOlI47mHoA(@~5t zS#?JfeSgx56$(HId^BaL-Dl}Hdifu|3U=OU)Xz-yLwqx}IA4tr;!gddh-Wd80_?kS zBe25b5)t>BEzXu6!cYuA5c}L*lO$6@G9rWc!P*bzo{B@vi+LR8^+iF}eC3$n9g9Ou z%1CmlNKom|DLGz5R;LN#@&IuwYtmeLEDzFn^M6e!Mwl>0=BOUF`k@Y_X*$rsc*Zm~ zmnIEXG*mJmPX!rGglL|afZv)WIodap=5?^1;z^LpVD0&U0$ zG=PG!?n>(`1|Xh5C7m8;HG_cBhxltq<_S=^i=u6uAP8^0w{%f9BXm!jFnI7i>&fO z1^QLO0!?zgq5+SEcB}qwRF#`ai&(4zsHm|Z%(wp8x_|hMCQ1 zqd^MdBbm=w{3r$S@yuwf!&#j);TxpjU~V9*P#QMYa?St^WBJ-xqoWoyfD*c~ zMsr5maJ3uWYB@igWO#`IZNpr0tka-Oz*mfLxREVLM#th$LJ%KKSI1hM#DAbgB%vK^ zFiI2nFyS4~AGDxBBqJWnZ^BhsfRAU)V;#=wqzRBzPjQ=Ju(n|WKEu?dS2dATeXP+a zMc@}x`Ds=c7|=G7_K&q1r7(WQ46r=-hiT^5Kn`;xJ$)IukbO?$&mfRnEJdAaHe~=_ zu{3s~)f53ZO)}ky20Dkhk$>DdG&xA7o3WSkWo`0PD-QYPSyiHx&QHm? zKz+)(+6DF5*hVh?i4||F$7xDDf;Z`-0%PhB<2x>x_t>8+k$w^JID z2tAs61i=)fIM_E8oBS4_*(%^vd%wHvj+t diff --git a/search/search_index.json b/search/search_index.json index 41fcf6028c..7538db3d7e 100644 --- a/search/search_index.json +++ b/search/search_index.json @@ -1 +1 @@ -{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"The deltalake package","text":"

    This is the documentation for the native Rust/Python implementation of Delta Lake. It is based on the delta-rs Rust library and requires no Spark or JVM dependencies. For the PySpark implementation, see delta-spark instead.

    This module provides the capability to read, write, and manage Delta Lake tables with Python or Rust without Spark or Java. It uses Apache Arrow under the hood, so is compatible with other Arrow-native or integrated libraries such as pandas, DuckDB, and Polars.

    "},{"location":"#important-terminology","title":"Important terminology","text":"
    • \"Rust deltalake\" refers to the Rust API of delta-rs (no Spark dependency)
    • \"Python deltalake\" refers to the Python API of delta-rs (no Spark dependency)
    • \"Delta Spark\" refers to the Scala implementation of the Delta Lake transaction log protocol. This depends on Spark and Java.
    "},{"location":"#why-implement-the-delta-lake-transaction-log-protocol-in-rust-and-scala","title":"Why implement the Delta Lake transaction log protocol in Rust and Scala?","text":"

Delta Spark depends on Java and Spark, which is fine for many use cases, but not all Delta Lake users want to depend on these libraries. delta-rs allows you to use Delta Lake in Rust or other native projects, where depending on a JVM is often not an option.

    Python deltalake lets you query Delta tables without depending on Java/Scala.

    Suppose you want to query a Delta table with pandas on your local machine. Python deltalake makes it easy to query the table with a simple pip install command - no need to install Java.
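A minimal sketch of that workflow (the table path is a placeholder):

```python
# pip install deltalake pandas
from deltalake import DeltaTable

# Read an existing Delta table straight into a pandas DataFrame; no JVM required.
df = DeltaTable("./path/to/delta-table").to_pandas()
print(df.head())
```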

    "},{"location":"#contributing","title":"Contributing","text":"

The Delta Lake community welcomes contributions from all developers, regardless of your experience or programming background.

    You can write Rust code, Python code, documentation, submit bugs, or give talks to the community. We welcome all of these contributions.

    Feel free to join our Slack and message us in the #delta-rs channel any time!

    We value kind communication and building a productive, friendly environment for maximum collaboration and fun.

    "},{"location":"#project-history","title":"Project history","text":"

    Check out this video by Denny Lee & QP Hou to learn about the genesis of the delta-rs project:

    "},{"location":"delta-lake-best-practices/","title":"Delta Lake Best Practices","text":"

    This page outlines Delta Lake best practices.

    You should consider several factors to optimize the performance of your Delta tables and minimize costs.

    The Delta Lake best practices depend on your data ingestion into the Delta table and query patterns. You must understand your data and how users run queries to best leverage Delta Lake.

    "},{"location":"delta-lake-best-practices/#compacting-small-files","title":"Compacting small files","text":"

    Delta tables work best when the files are \u201cright-sized\u201d. Files that are too small create I/O overhead. Files that are too large limit the impact of file skipping (a critical query optimization).

    Delta tables can accumulate a lot of small files, especially if you\u2019re frequently writing small amounts of data. If your table has many small files, you should run a small compaction operation to consolidate all the tiny files into \u201cright-sized\u201d files.

    It\u2019s generally best for files in a Delta table to be between 100MB and 1GB, but that can vary based on the overall size of the table and the query patterns.

    Delta Lake makes it easy to compact the small files.
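For example, a small sketch using the Python API (the table path is a placeholder):

```python
from deltalake import DeltaTable

dt = DeltaTable("./path/to/delta-table")

# Rewrite many small files into fewer, larger files and report what changed.
print(dt.optimize.compact())
```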

    "},{"location":"delta-lake-best-practices/#optimizing-table-layout","title":"Optimizing table layout","text":"

    You can colocate similar data in the same files to make file skipping more effective. Delta Lake supports Z Ordering, which can colocate similar data in the same files.

    Z Ordering can yield impressive performance gains for low-cardinality columns but also works well for high-cardinality columns. This is an advantage compared to Hive-style partitioning, which is only suitable for low-cardinality columns.

    You must analyze the most common query patterns and Z Order your dataset based on the columns allowing the most file skipping. The ability to colocate data in the same files decreases when you add more Z Order columns.

    Let\u2019s look at Hive-style partitioning, another way to colocate data in the same files. You can also use Hive-style partitioning in conjunction with Z Ordering.

    "},{"location":"delta-lake-best-practices/#partitioning-datasets","title":"Partitioning datasets","text":"

    You can partition your Delta tables, which separates the data by one or more partition keys into separate folders. Partitioning can be an excellent performance optimization (when you filter on the partition key) and is a good way to sidestep concurrency conflict issues.

    Hive-style partitioning also has some significant downsides.

    • It\u2019s only suitable for low-cardinality columns.
    • It can create many small files, especially if you use the wrong partition key or frequently update the Delta table.
    • It can cause some queries that don\u2019t rely on the partition key to run slower (because of the excessive number of small files). A large number of small files is problematic for I/O throughput.

    Hive-style partitioning can be a great data management tactic and a fantastic option for many Delta tables. Beware of the downsides before partitioning your tables.

    You can use Hive-style partitioning in conjunction with Z Ordering. You can partition a table by one column and Z Order by another. They\u2019re different tactics that aim to help you skip more files and run queries faster.

    Let\u2019s look at some of the built-in Delta features that help maintain the integrity of your tables.

    "},{"location":"delta-lake-best-practices/#use-appropriate-quality-controls","title":"Use appropriate quality controls","text":"

    Delta Lake supports schema enforcement and column constraints to protect the integrity of your data.

Delta Lake enables schema enforcement by default, so you can only append data to an existing table with the same exact schema. You can bypass schema enforcement by enabling schema evolution, which allows you to append mismatched schemas to a table.

    You should only enable schema evolution when you want to allow the schema of your table to change. You should not enable schema evolution if you don\u2019t want this flexibility. Schema enforcement is a good default setting.
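As a hedged sketch of opting in to schema evolution on an append (the table path and columns are made up):

```python
import pandas as pd
from deltalake import write_deltalake

new_rows = pd.DataFrame({"id": [4], "country": ["BR"], "signup_date": ["2024-01-01"]})

# schema_mode="merge" allows the extra signup_date column instead of raising
# the default schema-enforcement error; schema merging requires the rust engine.
write_deltalake("tmp/example-table", new_rows, mode="append", schema_mode="merge", engine="rust")
```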

    Column-level constraints prevent you from appending data that fail SQL predicates. For example, you may add a constraint that requires all the values in the age column of a table to be positive.

    You should add column constraints to your table whenever you want a column only to include values that satisfy a predicate.
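A minimal sketch of adding such a constraint (the table path and constraint name are placeholders):

```python
from deltalake import DeltaTable

dt = DeltaTable("./path/to/delta-table")

# Reject any future append that contains a non-positive age.
dt.alter.add_constraint({"age_is_positive": "age > 0"})
```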

    No data is appended when you apply a constraint and a row check fails. For example, if you try to append 100 rows of data to a table and one row has a failing check, then no data is added.

    When you have column constraints, it\u2019s often a good idea to append the failing data to a \u201cquarantine table\u201d and the passing data to the main Delta table. Or you can filter out the failing rows and just append the passing rows. Keeping a history of the failing rows in a quarantine table is helpful for debugging.

    See here to learn more about Delta Lake constraints.

    "},{"location":"delta-lake-best-practices/#best-practices-for-dml-operations","title":"Best practices for DML operations","text":"

    DML operations like deleting, updating, and merging write existing data in new files and mark existing files for deletion in the transaction log. Rewriting data files is expensive, so you want to minimize the number of rewritten files when you run DML operations.

    Delta Lake supports a table feature called deletion vectors that implements DML transactions more efficiently under the hood. Enabling deletion vectors is usually the best way to make DML operations run faster. Note: delta-rs doesn\u2019t support deletion vectors yet.

    You should periodically purge deletion vectors because they can accumulate and slow subsequent read operations. Once you enable the feature, you must purge the deletion vectors in your table with an appropriate cadence.

    "},{"location":"delta-lake-best-practices/#use-vacuum-to-save-storage-costs","title":"Use vacuum to save storage costs","text":"

    Delta Lake supports transactions, which necessitates keeping old versions of data in storage, even the files marked for removal in the transactions log.

    Keeping old versions of Delta tables in storage is often desirable because it allows for versioned data, time travel, and rolling back tables to a previous state.

    If you don\u2019t want to leverage older versions of a table, then you should remove the legacy files from storage with the vacuum command. Vacuum will remove all files older than the table retention period and marked for removal in the transaction log.
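A small sketch of the vacuum workflow (the table path and retention period are placeholders):

```python
from deltalake import DeltaTable

dt = DeltaTable("./path/to/delta-table")

# List the files that would be removed, then actually delete them.
print(dt.vacuum(retention_hours=168))
dt.vacuum(retention_hours=168, dry_run=False)
```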

    You only need to vacuum when you perform operations that mark files for removal in the transaction log. An append-only table doesn\u2019t create legacy files that need to be vacuumed.

    Create a good vacuum strategy for your tables to minimize your storage costs.

    "},{"location":"delta-lake-best-practices/#delta-lake-best-practices-to-minimize-costs","title":"Delta Lake best practices to minimize costs","text":"

    Delta Lake helps you minimize costs in many ways:

    • It's a free, open source format (based on Parquet). It's not a proprietary format that you need to pay for.
    • Delta tables store column-level min/max values in the transaction log, allowing file skipping.
    • Delta tables can be optimized (small file compaction, Z Ordering, etc.), so your queries run faster. When your queries run faster, then you pay less on compute.
    • Deletion vectors let you perform DML operations (delete, update, merge) much faster. If your delete operation runs 100x faster, then you pay 100x less compute.
    • It's easy to remove legacy files from storage with VACUUM, which minimizes storage costs.

    You should understand your organization\u2019s query patterns and use these features to minimize the overall cost. You need to assess tradeoffs. For example, Z Ordering is a computation that costs money, but it can save you lots of money in the long run if all your subsequent queries run a lot faster and use less compute.

    "},{"location":"delta-lake-best-practices/#collect-metadata-stats-on-columns-used-for-file-skipping","title":"Collect metadata stats on columns used for file skipping","text":"

    Delta tables don\u2019t always store each column's min/max values. Some Delta Lake implementations only store min/max values for the first 32 columns in the table, for example.

    Delta Lake can only apply file-skipping when it has min/max values for the relevant columns stored in the transaction log. Suppose you\u2019re running a filtering operation on col_a, for example. Delta Lake can only apply file skipping when the transaction log stores col_a min/max metadata.

    Ensure the transaction log stores metadata stats for all the columns that benefit from file skipping.

    "},{"location":"delta-lake-best-practices/#dont-collect-column-metadata-when-its-unnecessary","title":"Don\u2019t collect column metadata when it\u2019s unnecessary","text":"

    It takes some time to compute column statistics when writing files, and it isn\u2019t worth the effort if you cannot use the column for file skipping.

    Suppose you have a table column containing a long string of arbitrary text. It\u2019s unlikely that this column would ever provide any data-skipping benefits. So, you can just avoid the overhead of collecting the statistics for this particular column.

    "},{"location":"delta-lake-best-practices/#additional-reading","title":"Additional reading","text":"

    Delta Lake relies on transactions, and you should check out this page to learn more.

    Many Delta Lake performance benefits rely on file skipping, which you should understand well to get the most out of Delta.

    "},{"location":"delta-lake-best-practices/#conclusion","title":"Conclusion","text":"

    Delta Lake is a powerful technology that makes your data pipelines more reliable, saves money, and makes everyday data processing tasks easy.

    You need to learn how Delta Lake works at a high level to leverage Delta's power fully. You will not be able to leverage Delta Lake\u2019s full performance potential if your table has improperly sized files or if you\u2019re not colocating data in the same files to maximize data skipping, for example.

    Luckily, there are only a few details that are important to learn. You don\u2019t need to know the implementation details - just the essential high-level concepts.

    "},{"location":"delta-lake-big-data-small-data/","title":"Delta Lake for big data and small data","text":"

    Delta Lake is an excellent storage format for big data and small data.

    This post explains why Delta Lake is suitable for massive datasets and why many of these features that are great, even for tiny tables. Delta Lake is fine for a table with less than 1 GB of data or hundreds of petabytes of data.

    Let\u2019s start by discussing the features that are great for small data.

    "},{"location":"delta-lake-big-data-small-data/#delta-lake-for-small-data-tables","title":"Delta Lake for small data tables","text":"

    Delta Lake has many features that are useful for small datasets:

    • Reliable transactions
    • Better performance via file skipping
    • DML operations to make deletes, updates, and merges easy and performant
    • Features like schema enforcement and constraints to enforce data quality
    • Versioned data & time travel

    All of these features are great for large and small tables.

    Delta Lake DML operations are ACID transactions, so they either finish entirely or don\u2019t finish at all. Delta tables don\u2019t require any downtime while DML operations are running. The Delta Lake user experience is better than a data lake that doesn\u2019t support transactions and has downtime while running DML operations.

The Delta Lake API also makes it easy to run DML operations. You can delete rows from a Delta table with a single line of code, as shown below. Writing code to delete rows from CSV files is more challenging, especially if you want to implement this operation efficiently.
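
For example, deleting matching rows is a one-liner with the DeltaTable.delete API; the table path and predicate below are illustrative.

from deltalake import DeltaTable\n\ndt = DeltaTable(\"tmp/some_table\")  # illustrative path\n# Only the files containing matching rows are rewritten\ndt.delete(\"id = 123\")\n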

Delta Lake has built-in checks to retain the integrity of your tables. For example, Delta tables have schema enforcement and prevent you from appending DataFrames whose schema does not match the existing table. Delta Lake also lets you add constraints that only allow appending specific values to a column. Data quality is also essential for small tables!

    Delta Lake splits data into multiple files with file-level metadata in the transaction log, so query engines can sometimes skip data. Data skipping can be a huge performance benefit, depending on how much data can be ignored by the query engine.

    As previously mentioned, Delta tables record all DML operations as transactions. Recording operations as transactions means that existing data isn\u2019t mutated. So Delta Lake provides versioned data and time travel out of the box. Versioning data is better because it allows you to roll back mistakes and compare the state of the table at different points in time.
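
As a sketch, you can load an earlier version of a table either at construction time or with load_as_version; the path and version number are illustrative.

from deltalake import DeltaTable\n\n# Load a specific table version directly\ndt = DeltaTable(\"tmp/some_table\", version=1)\n\n# Or time travel an already-loaded table\ndt = DeltaTable(\"tmp/some_table\")\ndt.load_as_version(1)\n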

    Delta Lake has many useful features for small data tables. Let\u2019s look at how Delta Lake is scalable for massive datasets.

    "},{"location":"delta-lake-big-data-small-data/#delta-lake-for-large-data-tables","title":"Delta Lake for large data tables","text":"

    Delta Lake is designed to be scalable and can handle tables with terabytes or petabytes of data.

    See here for an example of an organization ingesting 220 TB of data into a Delta table daily.

    Delta tables store data in Parquet files, and cloud object stores allow engines to write any number of files. Delta tables store metadata information in the transaction log as JSON files, which are periodically compacted into Parquet files, so an arbitrarily large amount of Delta table metadata can also be stored.

    Delta Lake transactions and concurrency protection maintain the integrity of tables, even for large write operations or long-running computations.

    It\u2019s well known that Delta tables are scalable, even for the most enormous tables.

    "},{"location":"delta-lake-big-data-small-data/#small-data-operations-on-large-tables","title":"Small data operations on large tables","text":"

    Delta Lake is flexible and allows you to use \u201csmall data engines,\u201d even for large tables, depending on the computation.

    Suppose you have a Delta table containing 10 TB of data and a pipeline that appends 0.5 GB of data to the table every hour. You don\u2019t need a big data query engine to append a small amount of data. You can set up this job to run the Delta table append with a small data engine like pandas or Polars.
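
A minimal sketch of such an hourly job, appending a small pandas DataFrame to an existing Delta table; the path and data are illustrative.

import pandas as pd\nfrom deltalake import write_deltalake\n\nnew_rows = pd.DataFrame({\"event_id\": [101, 102], \"value\": [3.14, 2.72]})\n# A small append like this does not need a distributed engine\nwrite_deltalake(\"tmp/events\", new_rows, mode=\"append\")\n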

    Delta tables are flexible and interoperable with many technologies so that you can use the right tool for each data processing job. This allows you to design pipelines how you\u2019d like and minimize costs.

    "},{"location":"delta-lake-big-data-small-data/#when-delta-lake-isnt-needed","title":"When Delta Lake isn\u2019t needed","text":"

    You don\u2019t need Delta Lake for a small dataset that never changes and can be stored in a single Parquet file.

    Suppose you have a 0.5 GB dataset in a Parquet file that never needs to be updated. You can just keep that data in a Parquet table. Reading the metadata from the Parquet footer of a single file isn\u2019t expensive. You won\u2019t be taking advantage of Delta Lake's features like transactions, convenient DML operations, or versioned data.

    But in most cases, it\u2019s best to use Delta Lake because its features protect the integrity of your tables and make your life easier.

    "},{"location":"delta-lake-big-data-small-data/#conclusion","title":"Conclusion","text":"

    Delta Lake is well known for being scalable to huge tables but is also an excellent technology for small tables.

    Delta Lake is a lightweight technology, so there is little overhead. Writing the metadata file after performing a transaction is fast. It\u2019s a minuscule cost, considering the benefits you receive.

    Many reasons that make Delta Lake better than data lakes for large tables also apply to small tables!

    "},{"location":"why-use-delta-lake/","title":"Why use Delta Lake","text":"

    This page explains why Delta Lake is a better storage format for most tabular data analyses than data lake alternatives.

    Delta Lake provides developer-friendly features, reliable transactions, and fast performance compared with alternatives like Parquet or CSV.

    "},{"location":"why-use-delta-lake/#fast-performance","title":"Fast performance","text":"

    Delta tables store data in Parquet files and persist file-level metadata in the transaction log.

    This offers two main performance advantages:

    • File skipping based on metadata that\u2019s quickly accessible
    • Easy identification of all file paths for the table, compared to file listing operations that can be slow, especially on cloud object stores

    Delta Lake stores min/max values for each column of each file in the table. Certain queries can skip entire files based on the metadata. File skipping can be a massive performance optimization.

    Delta Lake also makes it easy to rearrange data in the table, so more file skipping is possible. For example, the table can be partitioned or Z Ordered, so that similar data is colocated in the same files and data skipping is optimal for your query patterns.
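
For example, you can colocate similar data at write time with partition_by; rewriting existing files with a Z Order pass is sketched below under the assumption that the TableOptimizer namespace exposes a z_order method (treat the method name, path, and columns as illustrative).

import pandas as pd\nfrom deltalake import DeltaTable, write_deltalake\n\ndf = pd.DataFrame({\"country\": [\"US\", \"DE\", \"US\"], \"amount\": [1, 2, 3]})\n# Hive-style partitioning colocates rows with the same country in the same files\nwrite_deltalake(\"tmp/sales\", df, partition_by=[\"country\"])\n\n# Assumed API: Z Order the table on a frequently filtered column\nDeltaTable(\"tmp/sales\").optimize.z_order([\"amount\"])\n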

    For data lakes, you need to run file listing operations to get the file paths before you can actually read the data. Listing all the files in a data lake can take a long time, especially if there are a lot of files and they are stored in Hive-style partitions.

    Delta Lake stores all the file paths in the transaction log. So you can quickly get the file paths directly from the log and then run your query. Delta Lake also stores the file-level metadata in the transaction log which is quicker than opening all the files in the data lake and grabbing the metadata from the file footer.
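
A small sketch: the resolved file paths come straight from the transaction log, so no file listing against the storage location is needed (the path is illustrative).

from deltalake import DeltaTable\n\ndt = DeltaTable(\"tmp/some_table\")\n# Absolute URIs resolved from the transaction log, without a listing operation\nprint(dt.file_uris())\n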

    "},{"location":"why-use-delta-lake/#developer-friendly-features","title":"Developer friendly features","text":"

Many basic data operations are hard in data lakes but quite easy with Delta Lake. The only data operation that\u2019s easy in a data lake is appending data. Delta Lake makes all data operations easy, including the following:

    • Appends
    • Upserts
    • Deletes
    • Replace where

    Even deleting a few rows of data from a data lake is hard. It\u2019s even harder if you want to run the operation in a performant manner.

    Delta Lake makes it easy to run common data operations and executes them performantly under the hood.
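
For example, a replace-where style operation can be expressed as a predicate-scoped overwrite with the Rust engine; the path, column, and data are illustrative.

import pandas as pd\nfrom deltalake import write_deltalake\n\ncorrected = pd.DataFrame({\"region\": [\"EU\", \"EU\"], \"amount\": [10, 20]})\n# Replace only the rows matching the predicate, leaving other data untouched\nwrite_deltalake(\n    \"tmp/orders\",\n    corrected,\n    mode=\"overwrite\",\n    predicate=\"region = 'EU'\",\n    engine=\"rust\",\n)\n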

Delta Lake also executes write operations as transactions, which makes data operations safer and prevents downtime. Write operations leave data lakes in an unstable state while the computation is running. For example, if you read a data lake while a delete operation is running, you may get the wrong data.

    Let\u2019s explore the benefits of reliable transactions in more detail.

    "},{"location":"why-use-delta-lake/#reliable-transactions","title":"Reliable transactions","text":"

Delta Lake supports transactions, which means that write operations have the following characteristics:

    • They either finish completely or don\u2019t run at all
    • They are executed in a serial manner and don\u2019t conflict with other transactions
    • They don\u2019t corrupt a table or violate table constraints

    Data lakes don\u2019t support transactions, so the write operations can cause the following errors:

    • There is no schema enforcement, so you can append data to a data lake with a mismatching schema
    • Reading the data lake often yields incorrect results while write transactions are performed
• Data lakes can be corrupted by invalid write operations or computations that error out
    • Concurrent transactions that conflict can cause data loss

    Production data systems should rely on storage systems like Delta Lake that support transactions.

    "},{"location":"why-use-delta-lake/#interoperability","title":"Interoperability","text":"

    Delta Lake tables are interoperable and can be read/written by multiple different query engines.

    For example, you can create a Delta table with Spark, append to it with pandas, and then read it with Polars.
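
A sketch of the pandas and Polars legs of that workflow; the Polars read_delta call and the path are assumptions used for illustration.

import pandas as pd\nimport polars as pl\nfrom deltalake import write_deltalake\n\n# Append with pandas\nwrite_deltalake(\"tmp/interop_table\", pd.DataFrame({\"x\": [1, 2]}), mode=\"append\")\n\n# Read the same table with Polars (assumed Polars API)\nprint(pl.read_delta(\"tmp/interop_table\"))\n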

    Delta tables are powerful because they are interoperable with various query engines and computation runtimes.

    Suppose you have a Delta table that\u2019s updated with an AWS Lambda function every 5 minutes. There is only a small amount of data collected every 5 minutes, so a lightweight runtime like AWS Lambda is sufficient.

    Further suppose that the overall table is quite large. So when you want to perform DML operations or query the whole table, your team uses a Spark cluster.

Delta Lake is flexible enough to allow these types of operations from multiple readers and writers. This gives teams the flexibility to choose the right tool for each job.

    "},{"location":"why-use-delta-lake/#support-for-many-languages","title":"Support for many languages","text":"

Delta tables can be queried from a variety of different languages. This project provides APIs for Rust and Python users and does not depend on Java or Scala, which makes it a great fit for users of pandas, Polars, DuckDB, or DataFusion.

    Delta Lake supports many languages and even more language support is coming soon!

    "},{"location":"why-use-delta-lake/#support-on-multiple-clouds","title":"Support on multiple clouds","text":"

    Delta Lake supports multiple clouds including GCP, AWS, and Azure.

    You can also use Delta Lake on your local machine or in an on-prem environment.

    Delta Lake is quite portable.

    "},{"location":"why-use-delta-lake/#conclusion","title":"Conclusion","text":"

    Delta Lake is a mature table format that offers users tons of advantages over a data lake with virtually no downsides.

    Once you start using Delta Lake, you will never want to go back to data lakes that expose you to a variety of dangerous bugs, poor performance, and reliability issues.

The Delta Lake community is also welcoming and open. We gladly accept new contributors and help users with their questions.

    "},{"location":"api/catalog/","title":"Catalog","text":"","boost":2},{"location":"api/catalog/#deltalake.data_catalog.DataCatalog","title":"deltalake.data_catalog.DataCatalog","text":"

    Bases: Enum

    List of the Data Catalogs
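
As a sketch, the enum is typically passed to DeltaTable.from_data_catalog; the database and table names below are illustrative.

from deltalake import DeltaTable\nfrom deltalake.data_catalog import DataCatalog\n\ndt = DeltaTable.from_data_catalog(\n    data_catalog=DataCatalog.AWS,\n    database_name=\"my_glue_database\",\n    table_name=\"my_delta_table\",\n)\n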

    ","boost":2},{"location":"api/catalog/#deltalake.data_catalog.DataCatalog.AWS","title":"AWS class-attribute instance-attribute","text":"
    AWS = 'glue'\n

Refers to the AWS Glue Data Catalog: https://docs.aws.amazon.com/glue/latest/dg/catalog-and-crawler.html

    ","boost":2},{"location":"api/catalog/#deltalake.data_catalog.DataCatalog.UNITY","title":"UNITY class-attribute instance-attribute","text":"
    UNITY = 'unity'\n

Refers to the Databricks Unity Catalog: https://docs.databricks.com/data-governance/unity-catalog/index.html

    ","boost":2},{"location":"api/delta_writer/","title":"Writer","text":"","boost":10},{"location":"api/delta_writer/#write-to-delta-tables","title":"Write to Delta Tables","text":"","boost":10},{"location":"api/delta_writer/#deltalake.write_deltalake","title":"deltalake.write_deltalake","text":"
    write_deltalake(table_or_uri: Union[str, Path, DeltaTable], data: Union[pd.DataFrame, ds.Dataset, pa.Table, pa.RecordBatch, Iterable[pa.RecordBatch], RecordBatchReader], *, schema: Optional[Union[pa.Schema, DeltaSchema]] = None, partition_by: Optional[Union[List[str], str]] = None, mode: Literal['error', 'append', 'overwrite', 'ignore'] = 'error', file_options: Optional[ds.ParquetFileWriteOptions] = None, max_partitions: Optional[int] = None, max_open_files: int = 1024, max_rows_per_file: int = 10 * 1024 * 1024, min_rows_per_group: int = 64 * 1024, max_rows_per_group: int = 128 * 1024, name: Optional[str] = None, description: Optional[str] = None, configuration: Optional[Mapping[str, Optional[str]]] = None, overwrite_schema: bool = False, schema_mode: Optional[Literal['merge', 'overwrite']] = None, storage_options: Optional[Dict[str, str]] = None, partition_filters: Optional[List[Tuple[str, str, Any]]] = None, predicate: Optional[str] = None, large_dtypes: bool = False, engine: Literal['pyarrow', 'rust'] = 'pyarrow', writer_properties: Optional[WriterProperties] = None, custom_metadata: Optional[Dict[str, str]] = None) -> None\n

    Write to a Delta Lake table

    If the table does not already exist, it will be created.

The pyarrow writer currently supports protocol version 2 and won't be updated. For higher protocol support, use engine='rust'; this will become the default eventually.

    A locking mechanism is needed to prevent unsafe concurrent writes to a delta lake directory when writing to S3. For more information on the setup, follow this usage guide: https://delta-io.github.io/delta-rs/usage/writing/writing-to-s3-with-locking-provider/
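
A minimal usage sketch; the local path and DataFrame are illustrative.

import pandas as pd\nfrom deltalake import write_deltalake\n\ndf = pd.DataFrame({\"id\": [1, 2, 3]})\nwrite_deltalake(\"tmp/some_table\", df)  # creates the table if it does not exist\nwrite_deltalake(\"tmp/some_table\", df, mode=\"append\")  # appends more rows\n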

    Parameters:

    Name Type Description Default table_or_uri Union[str, Path, DeltaTable]

    URI of a table or a DeltaTable object.

    required data Union[DataFrame, Dataset, Table, RecordBatch, Iterable[RecordBatch], RecordBatchReader]

    Data to write. If passing iterable, the schema must also be given.

    required schema Optional[Union[Schema, Schema]]

    Optional schema to write.

    None partition_by Optional[Union[List[str], str]]

    List of columns to partition the table by. Only required when creating a new table.

    None mode Literal['error', 'append', 'overwrite', 'ignore']

    How to handle existing data. Default is to error if table already exists. If 'append', will add new data. If 'overwrite', will replace table with new data. If 'ignore', will not write anything if table already exists.

    'error' file_options Optional[ParquetFileWriteOptions]

    Optional write options for Parquet (ParquetFileWriteOptions). Can be provided with defaults using ParquetFileWriteOptions().make_write_options(). Please refer to https://github.com/apache/arrow/blob/master/python/pyarrow/_dataset_parquet.pyx#L492-L533 for the list of available options. Only used in pyarrow engine.

    None max_partitions Optional[int]

    the maximum number of partitions that will be used. Only used in pyarrow engine.

    None max_open_files int

    Limits the maximum number of files that can be left open while writing. If an attempt is made to open too many files then the least recently used file will be closed. If this setting is set too low you may end up fragmenting your data into many small files. Only used in pyarrow engine.

    1024 max_rows_per_file int

Maximum number of rows per file. If greater than 0 then this will limit how many rows are placed in any single file. Otherwise there will be no limit and one file will be created in each output directory unless files need to be closed to respect max_open_files. Only used in pyarrow engine.

10 * 1024 * 1024 min_rows_per_group int

Minimum number of rows per group. When the value is set, the dataset writer will batch incoming data and only write the row groups to disk when sufficient rows have accumulated. Only used in pyarrow engine.

64 * 1024 max_rows_per_group int

    Maximum number of rows per group. If the value is set, then the dataset writer may split up large incoming batches into multiple row groups. If this value is set, then min_rows_per_group should also be set.

    128 * 1024 name Optional[str]

    User-provided identifier for this table.

    None description Optional[str]

    User-provided description for this table.

    None configuration Optional[Mapping[str, Optional[str]]]

    A map containing configuration options for the metadata action.

    None overwrite_schema bool

    Deprecated, use schema_mode instead.

    False schema_mode Optional[Literal['merge', 'overwrite']]

    If set to \"overwrite\", allows replacing the schema of the table. Set to \"merge\" to merge with existing schema.

    None storage_options Optional[Dict[str, str]]

    options passed to the native delta filesystem.

    None predicate Optional[str]

    When using Overwrite mode, replace data that matches a predicate. Only used in rust engine.

    None partition_filters Optional[List[Tuple[str, str, Any]]]

    the partition filters that will be used for partition overwrite. Only used in pyarrow engine.

    None large_dtypes bool

If True, the data schema is kept in large_dtypes; this has no effect on pandas DataFrame input.

    False engine Literal['pyarrow', 'rust']

writer engine to write the delta table. The Rust engine is still experimental, but you may see up to 4x performance improvements over pyarrow.

    'pyarrow' writer_properties Optional[WriterProperties]

    Pass writer properties to the Rust parquet writer.

    None custom_metadata Optional[Dict[str, str]]

    Custom metadata to add to the commitInfo.

    None","boost":10},{"location":"api/delta_writer/#deltalake.WriterProperties","title":"deltalake.WriterProperties dataclass","text":"
    WriterProperties(data_page_size_limit: Optional[int] = None, dictionary_page_size_limit: Optional[int] = None, data_page_row_count_limit: Optional[int] = None, write_batch_size: Optional[int] = None, max_row_group_size: Optional[int] = None, compression: Optional[Literal['UNCOMPRESSED', 'SNAPPY', 'GZIP', 'BROTLI', 'LZ4', 'ZSTD', 'LZ4_RAW']] = None, compression_level: Optional[int] = None)\n

    A Writer Properties instance for the Rust parquet writer.

    Create a Writer Properties instance for the Rust parquet writer:
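
For example (a sketch; the values and path are illustrative), tune compression and pass the properties to write_deltalake together with the Rust engine:

import pandas as pd\nfrom deltalake import WriterProperties, write_deltalake\n\nwp = WriterProperties(compression=\"ZSTD\", compression_level=3)\nwrite_deltalake(\n    \"tmp/compressed_table\",\n    pd.DataFrame({\"x\": [1, 2, 3]}),\n    engine=\"rust\",\n    writer_properties=wp,\n)\n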

    Parameters:

    Name Type Description Default data_page_size_limit Optional[int]

    Limit DataPage size to this in bytes.

    None dictionary_page_size_limit Optional[int]

    Limit the size of each DataPage to store dicts to this amount in bytes.

    None data_page_row_count_limit Optional[int]

    Limit the number of rows in each DataPage.

    None write_batch_size Optional[int]

    Splits internally to smaller batch size.

    None max_row_group_size Optional[int]

    Max number of rows in row group.

    None compression Optional[Literal['UNCOMPRESSED', 'SNAPPY', 'GZIP', 'BROTLI', 'LZ4', 'ZSTD', 'LZ4_RAW']]

    compression type.

    None compression_level Optional[int]

If None and the compression codec has a level, the default level will be used. Only relevant for GZIP (levels 1-9), BROTLI (levels 1-11), and ZSTD (levels 1-22).

    None","boost":10},{"location":"api/delta_writer/#convert-to-delta-tables","title":"Convert to Delta Tables","text":"","boost":10},{"location":"api/delta_writer/#deltalake.convert_to_deltalake","title":"deltalake.convert_to_deltalake","text":"
    convert_to_deltalake(uri: Union[str, Path], mode: Literal['error', 'ignore'] = 'error', partition_by: Optional[pa.Schema] = None, partition_strategy: Optional[Literal['hive']] = None, name: Optional[str] = None, description: Optional[str] = None, configuration: Optional[Mapping[str, Optional[str]]] = None, storage_options: Optional[Dict[str, str]] = None, custom_metadata: Optional[Dict[str, str]] = None) -> None\n

    Convert parquet tables to delta tables.

Currently only HIVE partitioned tables are supported. Converting to Delta creates a transaction log commit with add actions and any additional properties provided, such as configuration, name, and description.
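
A minimal usage sketch; the path is illustrative.

from deltalake import convert_to_deltalake\n\n# Adds a Delta transaction log on top of an existing Parquet table in place\nconvert_to_deltalake(\"tmp/existing_parquet_table\")\n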

    Parameters:

    Name Type Description Default uri Union[str, Path]

    URI of a table.

    required partition_by Optional[Schema]

    Optional partitioning schema if table is partitioned.

    None partition_strategy Optional[Literal['hive']]

    Optional partition strategy to read and convert

    None mode Literal['error', 'ignore']

    How to handle existing data. Default is to error if table already exists. If 'ignore', will not convert anything if table already exists.

    'error' name Optional[str]

    User-provided identifier for this table.

    None description Optional[str]

    User-provided description for this table.

    None configuration Optional[Mapping[str, Optional[str]]]

    A map containing configuration options for the metadata action.

    None storage_options Optional[Dict[str, str]]

    options passed to the native delta filesystem. Unused if 'filesystem' is defined.

    None custom_metadata Optional[Dict[str, str]]

    custom metadata that will be added to the transaction commit

    None","boost":10},{"location":"api/exceptions/","title":"Exceptions","text":"","boost":2},{"location":"api/exceptions/#deltalake.exceptions.DeltaError","title":"deltalake.exceptions.DeltaError","text":"

    Bases: builtins.Exception

    The base class for Delta-specific errors.
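
Because the more specific exceptions derive from DeltaError, you can catch either the precise error or the base class; a small sketch with an illustrative path:

from deltalake import DeltaTable\nfrom deltalake.exceptions import DeltaError, TableNotFoundError\n\ntry:\n    dt = DeltaTable(\"tmp/does_not_exist\")\nexcept TableNotFoundError:\n    print(\"no Delta table at this location\")\nexcept DeltaError as err:\n    print(f\"other Delta-specific failure: {err}\")\n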

    ","boost":2},{"location":"api/exceptions/#deltalake.exceptions.DeltaProtocolError","title":"deltalake.exceptions.DeltaProtocolError","text":"

    Bases: _internal.DeltaError

Raised when a violation of the Delta protocol specification occurred.

    ","boost":2},{"location":"api/exceptions/#deltalake.exceptions.TableNotFoundError","title":"deltalake.exceptions.TableNotFoundError","text":"

    Bases: _internal.DeltaError

    Raised when a Delta table cannot be loaded from a location.

    ","boost":2},{"location":"api/exceptions/#deltalake.exceptions.CommitFailedError","title":"deltalake.exceptions.CommitFailedError","text":"

    Bases: _internal.DeltaError

    Raised when a commit to a Delta table fails.

    ","boost":2},{"location":"api/schema/","title":"Schema","text":"","boost":2},{"location":"api/schema/#schema-and-field","title":"Schema and field","text":"

    Schemas, fields, and data types are provided in the deltalake.schema submodule.

    ","boost":2},{"location":"api/schema/#deltalake.Schema","title":"deltalake.Schema","text":"
    Schema(fields: List[Field])\n

    Bases: deltalake._internal.StructType

    A Delta Lake schema

Create using a list of Field objects:

Schema([Field(\"x\", \"integer\"), Field(\"y\", \"string\")])\n# Returns Schema([Field(x, PrimitiveType(\"integer\"), nullable=True), Field(y, PrimitiveType(\"string\"), nullable=True)])

    Or create from a PyArrow schema:

import pyarrow as pa\nSchema.from_pyarrow(pa.schema({\"x\": pa.int32(), \"y\": pa.string()}))\n# Returns Schema([Field(x, PrimitiveType(\"integer\"), nullable=True), Field(y, PrimitiveType(\"string\"), nullable=True)])

    ","boost":2},{"location":"api/schema/#deltalake.Schema.invariants","title":"invariants","text":"
    invariants: List[Tuple[str, str]] = <attribute 'invariants' of 'deltalake._internal.Schema' objects>\n
    ","boost":2},{"location":"api/schema/#deltalake.Schema.from_json","title":"from_json staticmethod","text":"
    from_json(schema_json) -> Schema\n

    Create a new Schema from a JSON string.

    Parameters:

    Name Type Description Default json str

    a JSON string

    required Example

    A schema has the same JSON format as a StructType.

    Schema.from_json('''{\n    \"type\": \"struct\",\n    \"fields\": [{\"name\": \"x\", \"type\": \"integer\", \"nullable\": true, \"metadata\": {}}]\n    }\n)'''\n# Returns Schema([Field(x, PrimitiveType(\"integer\"), nullable=True)])\n

    ","boost":2},{"location":"api/schema/#deltalake.Schema.from_pyarrow","title":"from_pyarrow staticmethod","text":"
    from_pyarrow(data_type) -> Schema\n

    Create a Schema from a PyArrow Schema type

    Will raise TypeError if the PyArrow type is not a primitive type.

    Parameters:

    Name Type Description Default type Schema

    A PyArrow Schema

    required

    Returns:

    Type Description Schema

    a Schema

    ","boost":2},{"location":"api/schema/#deltalake.Schema.to_json","title":"to_json method descriptor","text":"
    to_json() -> str\n

    Get the JSON string representation of the Schema.

    Returns:

    Type Description str

    a JSON string

    Example

    A schema has the same JSON format as a StructType.

    Schema([Field(\"x\", \"integer\")]).to_json()\n# Returns '{\"type\":\"struct\",\"fields\":[{\"name\":\"x\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}'\n

    ","boost":2},{"location":"api/schema/#deltalake.Schema.to_pyarrow","title":"to_pyarrow method descriptor","text":"
    to_pyarrow(as_large_types: bool = False) -> pyarrow.Schema\n

    Return equivalent PyArrow schema

    Parameters:

    Name Type Description Default as_large_types bool

    get schema with all variable size types (list, binary, string) as large variants (with int64 indices). This is for compatibility with systems like Polars that only support the large versions of Arrow types.

    False

    Returns:

    Type Description Schema

    a PyArrow Schema

    ","boost":2},{"location":"api/schema/#deltalake.Field","title":"deltalake.Field","text":"
    Field(name: str, type: DataType, *, nullable: bool = True, metadata: Optional[Dict[str, Any]] = None)\n
    ","boost":2},{"location":"api/schema/#deltalake.Field.metadata","title":"metadata","text":"
    metadata: Dict[str, Any] = <attribute 'metadata' of 'deltalake._internal.Field' objects>\n
    ","boost":2},{"location":"api/schema/#deltalake.Field.name","title":"name","text":"
    name: str = <attribute 'name' of 'deltalake._internal.Field' objects>\n
    ","boost":2},{"location":"api/schema/#deltalake.Field.nullable","title":"nullable","text":"
    nullable: bool = <attribute 'nullable' of 'deltalake._internal.Field' objects>\n
    ","boost":2},{"location":"api/schema/#deltalake.Field.type","title":"type","text":"
    type: DataType = <attribute 'type' of 'deltalake._internal.Field' objects>\n
    ","boost":2},{"location":"api/schema/#deltalake.Field.from_json","title":"from_json staticmethod","text":"
    from_json(field_json) -> Field\n

    Create a Field from a JSON string.

    Parameters:

    Name Type Description Default json str

    the JSON string.

    required

    Returns:

    Type Description Field

    Field

    Example
    Field.from_json('''{\n        \"name\": \"col\",\n        \"type\": \"integer\",\n        \"nullable\": true,\n        \"metadata\": {}\n    }'''\n)\n# Returns Field(col, PrimitiveType(\"integer\"), nullable=True)\n
    ","boost":2},{"location":"api/schema/#deltalake.Field.from_pyarrow","title":"from_pyarrow staticmethod","text":"
    from_pyarrow(field: pyarrow.Field) -> Field\n

Create a Field from a PyArrow field. Note: this currently doesn't preserve field metadata.

    Parameters:

    Name Type Description Default field Field

    a PyArrow Field

    required

    Returns:

    Type Description Field

    a Field

    ","boost":2},{"location":"api/schema/#deltalake.Field.to_json","title":"to_json method descriptor","text":"
    to_json() -> str\n

    Get the field as JSON string.

    Returns:

    Type Description str

    a JSON string

    Example
    Field(\"col\", \"integer\").to_json()\n# Returns '{\"name\":\"col\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}'\n
    ","boost":2},{"location":"api/schema/#deltalake.Field.to_pyarrow","title":"to_pyarrow method descriptor","text":"
    to_pyarrow() -> pyarrow.Field\n

Convert to an equivalent PyArrow field. Note: this currently doesn't preserve field metadata.

    Returns:

    Type Description Field

    a pyarrow Field

    ","boost":2},{"location":"api/schema/#data-types","title":"Data types","text":"","boost":2},{"location":"api/schema/#deltalake.schema.PrimitiveType","title":"deltalake.schema.PrimitiveType","text":"
    PrimitiveType(data_type: str)\n
    ","boost":2},{"location":"api/schema/#deltalake.schema.PrimitiveType.type","title":"type","text":"
    type: str = <attribute 'type' of 'deltalake._internal.PrimitiveType' objects>\n
    ","boost":2},{"location":"api/schema/#deltalake.schema.PrimitiveType.from_json","title":"from_json staticmethod","text":"
    from_json(type_json) -> PrimitiveType\n

    Create a PrimitiveType from a JSON string

    The JSON representation for a primitive type is just a quoted string: PrimitiveType.from_json('\"integer\"')

    Parameters:

    Name Type Description Default json str

    a JSON string

    required

    Returns:

    Type Description PrimitiveType

    a PrimitiveType type

    ","boost":2},{"location":"api/schema/#deltalake.schema.PrimitiveType.from_pyarrow","title":"from_pyarrow staticmethod","text":"
    from_pyarrow(data_type) -> PrimitiveType\n

    Create a PrimitiveType from a PyArrow datatype

    Will raise TypeError if the PyArrow type is not a primitive type.

    Parameters:

    Name Type Description Default type DataType

    A PyArrow DataType

    required

    Returns:

    Type Description PrimitiveType

    a PrimitiveType

    ","boost":2},{"location":"api/schema/#deltalake.schema.PrimitiveType.to_pyarrow","title":"to_pyarrow method descriptor","text":"
    to_pyarrow() -> pyarrow.DataType\n

    Get the equivalent PyArrow type (pyarrow.DataType)

    ","boost":2},{"location":"api/schema/#deltalake.schema.ArrayType","title":"deltalake.schema.ArrayType","text":"
    ArrayType(element_type: DataType, *, contains_null: bool = True)\n
    ","boost":2},{"location":"api/schema/#deltalake.schema.ArrayType.contains_null","title":"contains_null","text":"
    contains_null: bool = <attribute 'contains_null' of 'deltalake._internal.ArrayType' objects>\n
    ","boost":2},{"location":"api/schema/#deltalake.schema.ArrayType.element_type","title":"element_type","text":"
    element_type: DataType = <attribute 'element_type' of 'deltalake._internal.ArrayType' objects>\n
    ","boost":2},{"location":"api/schema/#deltalake.schema.ArrayType.type","title":"type","text":"
    type: Literal['array'] = <attribute 'type' of 'deltalake._internal.ArrayType' objects>\n
    ","boost":2},{"location":"api/schema/#deltalake.schema.ArrayType.from_json","title":"from_json staticmethod","text":"
    from_json(type_json) -> ArrayType\n

    Create an ArrayType from a JSON string

    Parameters:

    Name Type Description Default json str

    a JSON string

    required

    Returns:

    Type Description ArrayType

    an ArrayType

    Example

    The JSON representation for an array type is an object with type (set to \"array\"), elementType, and containsNull.

    ArrayType.from_json(\n    '''{\n        \"type\": \"array\",\n        \"elementType\": \"integer\",\n        \"containsNull\": false\n    }'''\n)\n# Returns ArrayType(PrimitiveType(\"integer\"), contains_null=False)\n

    ","boost":2},{"location":"api/schema/#deltalake.schema.ArrayType.from_pyarrow","title":"from_pyarrow staticmethod","text":"
    from_pyarrow(data_type) -> ArrayType\n

    Create an ArrayType from a pyarrow.ListType.

    Will raise TypeError if a different PyArrow DataType is provided.

    Parameters:

    Name Type Description Default type ListType

    The PyArrow ListType

    required

    Returns:

    Type Description ArrayType

    an ArrayType

    ","boost":2},{"location":"api/schema/#deltalake.schema.ArrayType.to_json","title":"to_json method descriptor","text":"
    to_json() -> str\n

    Get the JSON string representation of the type.

    ","boost":2},{"location":"api/schema/#deltalake.schema.ArrayType.to_pyarrow","title":"to_pyarrow method descriptor","text":"
    to_pyarrow() -> pyarrow.ListType\n

    Get the equivalent PyArrow type.

    ","boost":2},{"location":"api/schema/#deltalake.schema.MapType","title":"deltalake.schema.MapType","text":"
    MapType(key_type: DataType, value_type: DataType, *, value_contains_null: bool = True)\n
    ","boost":2},{"location":"api/schema/#deltalake.schema.MapType.key_type","title":"key_type","text":"
    key_type: DataType = <attribute 'key_type' of 'deltalake._internal.MapType' objects>\n
    ","boost":2},{"location":"api/schema/#deltalake.schema.MapType.type","title":"type","text":"
    type: Literal['map'] = <attribute 'type' of 'deltalake._internal.MapType' objects>\n
    ","boost":2},{"location":"api/schema/#deltalake.schema.MapType.value_contains_null","title":"value_contains_null","text":"
    value_contains_null: bool = <attribute 'value_contains_null' of 'deltalake._internal.MapType' objects>\n
    ","boost":2},{"location":"api/schema/#deltalake.schema.MapType.value_type","title":"value_type","text":"
    value_type: DataType = <attribute 'value_type' of 'deltalake._internal.MapType' objects>\n
    ","boost":2},{"location":"api/schema/#deltalake.schema.MapType.from_json","title":"from_json staticmethod","text":"
    from_json(type_json) -> MapType\n

    Create a MapType from a JSON string

    Parameters:

    Name Type Description Default json str

    a JSON string

    required

    Returns:

    Type Description MapType

a MapType

    Example

    The JSON representation for a map type is an object with type (set to map), keyType, valueType, and valueContainsNull:

    MapType.from_json(\n    '''{\n        \"type\": \"map\",\n        \"keyType\": \"integer\",\n        \"valueType\": \"string\",\n        \"valueContainsNull\": true\n    }'''\n)\n# Returns MapType(PrimitiveType(\"integer\"), PrimitiveType(\"string\"), value_contains_null=True)\n
    ","boost":2},{"location":"api/schema/#deltalake.schema.MapType.from_pyarrow","title":"from_pyarrow staticmethod","text":"
    from_pyarrow(data_type) -> MapType\n

    Create a MapType from a PyArrow MapType.

    Will raise TypeError if passed a different type.

    Parameters:

    Name Type Description Default type MapType

    the PyArrow MapType

    required

    Returns:

    Type Description MapType

    a MapType

    ","boost":2},{"location":"api/schema/#deltalake.schema.MapType.to_json","title":"to_json method descriptor","text":"
    to_json() -> str\n

    Get JSON string representation of map type.

    Returns:

    Type Description str

    a JSON string

    ","boost":2},{"location":"api/schema/#deltalake.schema.MapType.to_pyarrow","title":"to_pyarrow method descriptor","text":"
    to_pyarrow() -> pyarrow.MapType\n

    Get the equivalent PyArrow data type.

    ","boost":2},{"location":"api/schema/#deltalake.schema.StructType","title":"deltalake.schema.StructType","text":"
    StructType(fields: List[Field])\n
    ","boost":2},{"location":"api/schema/#deltalake.schema.StructType.fields","title":"fields","text":"
    fields: List[Field] = <attribute 'fields' of 'deltalake._internal.StructType' objects>\n
    ","boost":2},{"location":"api/schema/#deltalake.schema.StructType.type","title":"type","text":"
    type: Literal['struct'] = <attribute 'type' of 'deltalake._internal.StructType' objects>\n

    The string \"struct\"

    ","boost":2},{"location":"api/schema/#deltalake.schema.StructType.from_json","title":"from_json staticmethod","text":"
    from_json(type_json) -> StructType\n

    Create a new StructType from a JSON string.

    Parameters:

    Name Type Description Default json str

    a JSON string

    required

    Returns:

    Type Description StructType

    a StructType

    Example
    StructType.from_json(\n    '''{\n        \"type\": \"struct\",\n        \"fields\": [{\"name\": \"x\", \"type\": \"integer\", \"nullable\": true, \"metadata\": {}}]\n    }'''\n)\n# Returns StructType([Field(x, PrimitiveType(\"integer\"), nullable=True)])\n
    ","boost":2},{"location":"api/schema/#deltalake.schema.StructType.from_pyarrow","title":"from_pyarrow staticmethod","text":"
    from_pyarrow(data_type) -> StructType\n

    Create a new StructType from a PyArrow struct type.

    Will raise TypeError if a different data type is provided.

    Parameters:

    Name Type Description Default type StructType

    a PyArrow struct type.

    required

    Returns:

    Type Description StructType

    a StructType

    ","boost":2},{"location":"api/schema/#deltalake.schema.StructType.to_json","title":"to_json method descriptor","text":"
    to_json() -> str\n

    Get the JSON representation of the type.

    Returns:

    Type Description str

    a JSON string

    Example
    StructType([Field(\"x\", \"integer\")]).to_json()\n# Returns '{\"type\":\"struct\",\"fields\":[{\"name\":\"x\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}'\n
    ","boost":2},{"location":"api/schema/#deltalake.schema.StructType.to_pyarrow","title":"to_pyarrow method descriptor","text":"
    to_pyarrow() -> pyarrow.StructType\n

    Get the equivalent PyArrow StructType

    Returns:

    Type Description StructType

    a PyArrow StructType

    ","boost":2},{"location":"api/storage/","title":"Storage","text":"

    The delta filesystem handler for the pyarrow engine writer.

    ","boost":2},{"location":"api/storage/#deltalake.fs.DeltaStorageHandler","title":"deltalake.fs.DeltaStorageHandler","text":"
    DeltaStorageHandler(table_uri: str, options: Optional[Dict[str, str]] = None, known_sizes: Optional[Dict[str, int]] = None)\n

    Bases: FileSystemHandler

DeltaStorageHandler is a concrete implementation of a PyArrow FileSystemHandler.

    ","boost":2},{"location":"api/storage/#deltalake.fs.DeltaStorageHandler.copy_file","title":"copy_file","text":"
    copy_file(src: str, dst: str) -> None\n

    Copy a file.

    If the destination exists and is a directory, an error is returned. Otherwise, it is replaced.

    ","boost":2},{"location":"api/storage/#deltalake.fs.DeltaStorageHandler.create_dir","title":"create_dir","text":"
    create_dir(path: str, recursive: bool = True) -> None\n

    Create a directory and subdirectories.

    This function succeeds if the directory already exists.

    ","boost":2},{"location":"api/storage/#deltalake.fs.DeltaStorageHandler.delete_dir","title":"delete_dir","text":"
    delete_dir(path: str) -> None\n

    Delete a directory and its contents, recursively.

    ","boost":2},{"location":"api/storage/#deltalake.fs.DeltaStorageHandler.delete_dir_contents","title":"delete_dir_contents","text":"
    delete_dir_contents(path: str, *, accept_root_dir: bool = False, missing_dir_ok: bool = False) -> None\n

    Delete a directory's contents, recursively.

    Like delete_dir, but doesn't delete the directory itself.

    ","boost":2},{"location":"api/storage/#deltalake.fs.DeltaStorageHandler.delete_file","title":"delete_file","text":"
    delete_file(path: str) -> None\n

    Delete a file.

    ","boost":2},{"location":"api/storage/#deltalake.fs.DeltaStorageHandler.delete_root_dir_contents","title":"delete_root_dir_contents","text":"
    delete_root_dir_contents() -> None\n

    Delete the root directory contents, recursively.

    ","boost":2},{"location":"api/storage/#deltalake.fs.DeltaStorageHandler.get_file_info","title":"get_file_info","text":"
    get_file_info(paths: List[str]) -> List[FileInfo]\n

    Get info for the given files.

    A non-existing or unreachable file returns a FileStat object and has a FileType of value NotFound. An exception indicates a truly exceptional condition (low-level I/O error, etc.).

    ","boost":2},{"location":"api/storage/#deltalake.fs.DeltaStorageHandler.get_file_info_selector","title":"get_file_info_selector","text":"
    get_file_info_selector(selector: FileSelector) -> List[FileInfo]\n

    Get info for the files defined by FileSelector.

    Parameters:

    Name Type Description Default selector FileSelector

    FileSelector object

    required

    Returns:

    Type Description List[FileInfo]

    list of file info objects

    ","boost":2},{"location":"api/storage/#deltalake.fs.DeltaStorageHandler.move","title":"move","text":"
    move(src: str, dest: str) -> None\n

    Move / rename a file or directory.

    If the destination exists: - if it is a non-empty directory, an error is returned - otherwise, if it has the same type as the source, it is replaced - otherwise, behavior is unspecified (implementation-dependent).

    ","boost":2},{"location":"api/storage/#deltalake.fs.DeltaStorageHandler.normalize_path","title":"normalize_path","text":"
    normalize_path(path: str) -> str\n

    Normalize filesystem path.

    ","boost":2},{"location":"api/storage/#deltalake.fs.DeltaStorageHandler.open_input_file","title":"open_input_file","text":"
    open_input_file(path: str) -> pa.PythonFile\n

    Open an input file for random access reading.

    Parameters:

    Name Type Description Default path str

    The source to open for reading.

    required

    Returns:

    Type Description PythonFile

    NativeFile

    ","boost":2},{"location":"api/storage/#deltalake.fs.DeltaStorageHandler.open_input_stream","title":"open_input_stream","text":"
    open_input_stream(path: str) -> pa.PythonFile\n

    Open an input stream for sequential reading.

    Parameters:

    Name Type Description Default path str

    The source to open for reading.

    required

    Returns:

    Type Description PythonFile

    NativeFile

    ","boost":2},{"location":"api/storage/#deltalake.fs.DeltaStorageHandler.open_output_stream","title":"open_output_stream","text":"
    open_output_stream(path: str, metadata: Optional[Dict[str, str]] = None) -> pa.PythonFile\n

    Open an output stream for sequential writing.

    If the target already exists, existing data is truncated.

    Parameters:

    Name Type Description Default path str

    The source to open for writing.

    required metadata Optional[Dict[str, str]]

    If not None, a mapping of string keys to string values.

    None

    Returns:

    Type Description PythonFile

    NativeFile

    ","boost":2},{"location":"api/delta_table/","title":"DeltaTable","text":"","boost":2},{"location":"api/delta_table/#deltalake.DeltaTable","title":"deltalake.DeltaTable dataclass","text":"
    DeltaTable(table_uri: Union[str, Path, os.PathLike[str]], version: Optional[int] = None, storage_options: Optional[Dict[str, str]] = None, without_files: bool = False, log_buffer_size: Optional[int] = None)\n

    Represents a Delta Table

Create the Delta Table from a path with an optional version. Multiple StorageBackends are currently supported: AWS S3, Azure Data Lake Storage Gen2, Google Cloud Storage (GCS), and local URI. Depending on the storage backend used, you can provide option values using the storage_options parameter.
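
A small sketch of loading a table from object storage; the bucket path and the specific storage option key shown are illustrative assumptions that depend on your backend.

from deltalake import DeltaTable\n\n# Assumed example: credential/region keys depend on the storage backend used\ndt = DeltaTable(\n    \"s3://my-bucket/path/to/table\",\n    storage_options={\"AWS_REGION\": \"us-east-1\"},\n)\nprint(dt.files())\n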

    Parameters:

    Name Type Description Default table_uri Union[str, Path, PathLike[str]]

    the path of the DeltaTable

    required version Optional[int]

    version of the DeltaTable

    None storage_options Optional[Dict[str, str]]

    a dictionary of the options to use for the storage backend

    None without_files bool

    If True, will load table without tracking files. Some append-only applications might have no need of tracking any files. So, the DeltaTable will be loaded with a significant memory reduction.

    False log_buffer_size Optional[int]

    Number of files to buffer when reading the commit log. A positive integer. Setting a value greater than 1 results in concurrent calls to the storage api. This can decrease latency if there are many files in the log since the last checkpoint, but will also increase memory usage. Possible rate limits of the storage backend should also be considered for optimal performance. Defaults to 4 * number of cpus.

    None","boost":2},{"location":"api/delta_table/#deltalake.DeltaTable.alter","title":"alter property","text":"
    alter: TableAlterer\n

    Namespace for all table alter related methods.

    Returns:

    Name Type Description TableAlterer TableAlterer

    TableAlterer Object

    ","boost":2},{"location":"api/delta_table/#deltalake.DeltaTable.optimize","title":"optimize property","text":"
    optimize: TableOptimizer\n

    Namespace for all table optimize related methods.

    Returns:

    Name Type Description TableOptimizer TableOptimizer

    TableOptimizer Object

    ","boost":2},{"location":"api/delta_table/#deltalake.DeltaTable.cleanup_metadata","title":"cleanup_metadata","text":"
    cleanup_metadata() -> None\n

Delete expired log files before the current version from the table. The table log retention is based on the configuration.logRetentionDuration value, 30 days by default.

    ","boost":2},{"location":"api/delta_table/#deltalake.DeltaTable.create","title":"create classmethod","text":"
    create(table_uri: Union[str, Path], schema: Union[pyarrow.Schema, DeltaSchema], mode: Literal['error', 'append', 'overwrite', 'ignore'] = 'error', partition_by: Optional[Union[List[str], str]] = None, name: Optional[str] = None, description: Optional[str] = None, configuration: Optional[Mapping[str, Optional[str]]] = None, storage_options: Optional[Dict[str, str]] = None, custom_metadata: Optional[Dict[str, str]] = None) -> DeltaTable\n

    CREATE or CREATE_OR_REPLACE a delta table given a table_uri.

    Parameters:

    Name Type Description Default table_uri Union[str, Path]

    URI of a table

    required schema Union[Schema, Schema]

    Table schema

    required mode Literal['error', 'append', 'overwrite', 'ignore']

How to handle existing data. Default is to error if the table already exists. If 'append', returns a not-supported error if the table exists. If 'overwrite', will CREATE_OR_REPLACE the table. If 'ignore', will not do anything if the table already exists. Defaults to \"error\".

    'error' partition_by Optional[Union[List[str], str]]

    List of columns to partition the table by.

    None name Optional[str]

    User-provided identifier for this table.

    None description Optional[str]

    User-provided description for this table.

    None configuration Optional[Mapping[str, Optional[str]]]

    A map containing configuration options for the metadata action.

    None storage_options Optional[Dict[str, str]]

    options passed to the object store crate.

    None custom_metadata Optional[Dict[str, str]]

    custom metadata that will be added to the transaction commit.

    None

    Returns:

    Name Type Description DeltaTable DeltaTable

    created delta table

    Example
    import pyarrow as pa\n\nfrom deltalake import DeltaTable\n\ndt = DeltaTable.create(\n    table_uri=\"my_local_table\",\n    schema=pa.schema(\n        [pa.field(\"foo\", pa.string()), pa.field(\"bar\", pa.string())]\n    ),\n    mode=\"error\",\n    partition_by=\"bar\",\n)\n
    ","boost":2},{"location":"api/delta_table/#deltalake.DeltaTable.delete","title":"delete","text":"
    delete(predicate: Optional[str] = None, writer_properties: Optional[WriterProperties] = None, custom_metadata: Optional[Dict[str, str]] = None) -> Dict[str, Any]\n

Delete records from a Delta Table that satisfy a predicate.

    When a predicate is not provided then all records are deleted from the Delta Table. Otherwise a scan of the Delta table is performed to mark any files that contain records that satisfy the predicate. Once files are determined they are rewritten without the records.

    Parameters:

    Name Type Description Default predicate Optional[str]

    a SQL where clause. If not passed, will delete all rows.

    None writer_properties Optional[WriterProperties]

    Pass writer properties to the Rust parquet writer.

    None custom_metadata Optional[Dict[str, str]]

    custom metadata that will be added to the transaction commit.

    None

    Returns:

    Type Description Dict[str, Any]

    the metrics from delete.

    ","boost":2},{"location":"api/delta_table/#deltalake.DeltaTable.file_uris","title":"file_uris","text":"
    file_uris(partition_filters: Optional[List[Tuple[str, str, Any]]] = None) -> List[str]\n

    Get the list of files as absolute URIs, including the scheme (e.g. \"s3://\").

    Local files will be just plain absolute paths, without a scheme. (That is, no 'file://' prefix.)

    Use the partition_filters parameter to retrieve a subset of files that match the given filters.

    Parameters:

    Name Type Description Default partition_filters Optional[List[Tuple[str, str, Any]]]

    the partition filters that will be used for getting the matched files

    None

    Returns:

    Type Description List[str]

    list of the .parquet files with an absolute URI referenced for the current version of the DeltaTable

Predicates are expressed in disjunctive normal form (DNF), like [(\"x\", \"=\", \"a\"), ...]. DNF allows arbitrary boolean logical combinations of single partition predicates. The innermost tuples each describe a single partition predicate. The list of inner predicates is interpreted as a conjunction (AND), forming a more selective, multi-partition predicate. Each tuple has the format (key, op, value) and compares the key with the value. The supported ops are =, !=, in, and not in. If the op is in or not in, the value must be a collection such as a list, a set, or a tuple. The supported type for value is str. Use the empty string '' for a Null partition value.

    Example
    (\"x\", \"=\", \"a\")\n(\"x\", \"!=\", \"a\")\n(\"y\", \"in\", [\"a\", \"b\", \"c\"])\n(\"z\", \"not in\", [\"a\",\"b\"])\n
    ","boost":2},{"location":"api/delta_table/#deltalake.DeltaTable.files","title":"files","text":"
    files(partition_filters: Optional[List[Tuple[str, str, Any]]] = None) -> List[str]\n

    Get the .parquet files of the DeltaTable.

    The paths are as they are saved in the delta log, which may either be relative to the table root or absolute URIs.

    Parameters:

    Name Type Description Default partition_filters Optional[List[Tuple[str, str, Any]]]

    the partition filters that will be used for getting the matched files

    None

    Returns:

    Type Description List[str]

    list of the .parquet files referenced for the current version of the DeltaTable

Predicates are expressed in disjunctive normal form (DNF), like [(\"x\", \"=\", \"a\"), ...]. DNF allows arbitrary boolean logical combinations of single partition predicates. The innermost tuples each describe a single partition predicate. The list of inner predicates is interpreted as a conjunction (AND), forming a more selective, multi-partition predicate. Each tuple has the format (key, op, value) and compares the key with the value. The supported ops are =, !=, in, and not in. If the op is in or not in, the value must be a collection such as a list, a set, or a tuple. The supported type for value is str. Use the empty string '' for a Null partition value.

    Example
    (\"x\", \"=\", \"a\")\n(\"x\", \"!=\", \"a\")\n(\"y\", \"in\", [\"a\", \"b\", \"c\"])\n(\"z\", \"not in\", [\"a\",\"b\"])\n
    ","boost":2},{"location":"api/delta_table/#deltalake.DeltaTable.from_data_catalog","title":"from_data_catalog classmethod","text":"
    from_data_catalog(data_catalog: DataCatalog, database_name: str, table_name: str, data_catalog_id: Optional[str] = None, version: Optional[int] = None, log_buffer_size: Optional[int] = None) -> DeltaTable\n

    Create the Delta Table from a Data Catalog.

    Parameters:

    Name Type Description Default data_catalog DataCatalog

    the Catalog to use for getting the storage location of the Delta Table

    required database_name str

    the database name inside the Data Catalog

    required table_name str

    the table name inside the Data Catalog

    required data_catalog_id Optional[str]

    the identifier of the Data Catalog

    None version Optional[int]

    version of the DeltaTable

    None log_buffer_size Optional[int]

    Number of files to buffer when reading the commit log. A positive integer. Setting a value greater than 1 results in concurrent calls to the storage api. This can decrease latency if there are many files in the log since the last checkpoint, but will also increase memory usage. Possible rate limits of the storage backend should also be considered for optimal performance. Defaults to 4 * number of cpus.

    None","boost":2},{"location":"api/delta_table/#deltalake.DeltaTable.get_add_actions","title":"get_add_actions","text":"
    get_add_actions(flatten: bool = False) -> pyarrow.RecordBatch\n

    Return a dataframe with all current add actions.

    Add actions represent the files that currently make up the table. This data is a low-level representation parsed from the transaction log.

    Parameters:

    Name Type Description Default flatten bool

whether to flatten the schema. Partition value columns are given the prefix partition., statistics (null_count, min, and max) are given the prefixes null_count., min., and max., and tags are given the prefix tags.. Nested field names are concatenated with ..

    False

    Returns:

    Type Description RecordBatch

    a PyArrow RecordBatch containing the add action data.

    Example
    from pprint import pprint\nfrom deltalake import DeltaTable, write_deltalake\nimport pyarrow as pa\ndata = pa.table({\"x\": [1, 2, 3], \"y\": [4, 5, 6]})\nwrite_deltalake(\"tmp\", data, partition_by=[\"x\"])\ndt = DeltaTable(\"tmp\")\ndf = dt.get_add_actions().to_pandas()\ndf[\"path\"].sort_values(ignore_index=True)\n0    x=1/0\n1    x=2/0\n2    x=3/0\n
    df = dt.get_add_actions(flatten=True).to_pandas()\ndf[\"partition.x\"].sort_values(ignore_index=True)\n0    1\n1    2\n2    3\n
    ","boost":2},{"location":"api/delta_table/#deltalake.DeltaTable.history","title":"history","text":"
    history(limit: Optional[int] = None) -> List[Dict[str, Any]]\n

    Run the history command on the DeltaTable. The operations are returned in reverse chronological order.

    Parameters:

    Name Type Description Default limit Optional[int]

    the commit info limit to return

    None

    Returns:

    Type Description List[Dict[str, Any]]

    list of the commit infos registered in the transaction log

    ","boost":2},{"location":"api/delta_table/#deltalake.DeltaTable.load_as_version","title":"load_as_version","text":"
    load_as_version(version: Union[int, str, datetime]) -> None\n

    Load/time travel a DeltaTable to a specified version number, or a timestamp version of the table. If a string is passed then the argument should be an RFC 3339 and ISO 8601 date and time string format. If a datetime object without a timezone is passed, the UTC timezone will be assumed.

    Parameters:

    Name Type Description Default version Union[int, str, datetime]

    the identifier of the version of the DeltaTable to load

    required Example

    Use a version number

    dt = DeltaTable(\"test_table\")\ndt.load_as_version(1)\n

    Use a datetime object

    dt.load_as_version(datetime(2023, 1, 1))\ndt.load_as_version(datetime(2023, 1, 1, tzinfo=timezone.utc))\n

    Use a datetime in string format

    dt.load_as_version(\"2018-01-26T18:30:09Z\")\ndt.load_as_version(\"2018-12-19T16:39:57-08:00\")\ndt.load_as_version(\"2018-01-26T18:30:09.453+00:00\")\n

    ","boost":2},{"location":"api/delta_table/#deltalake.DeltaTable.load_version","title":"load_version","text":"
    load_version(version: int) -> None\n

    Load a DeltaTable with a specified version.

    Deprecated

    Load_version and load_with_datetime have been combined into DeltaTable.load_as_version.

    Parameters:

    Name Type Description Default version int

    the identifier of the version of the DeltaTable to load

    required","boost":2},{"location":"api/delta_table/#deltalake.DeltaTable.load_with_datetime","title":"load_with_datetime","text":"
    load_with_datetime(datetime_string: str) -> None\n

    Time travel Delta table to the latest version that's created at or before provided datetime_string argument. The datetime_string argument should be an RFC 3339 and ISO 8601 date and time string.

    Deprecated

    Load_version and load_with_datetime have been combined into DeltaTable.load_as_version.

    Parameters:

    Name Type Description Default datetime_string str

    the identifier of the datetime point of the DeltaTable to load

    required Example
    \"2018-01-26T18:30:09Z\"\n\"2018-12-19T16:39:57-08:00\"\n\"2018-01-26T18:30:09.453+00:00\"\n
    ","boost":2},{"location":"api/delta_table/#deltalake.DeltaTable.merge","title":"merge","text":"
    merge(source: Union[pyarrow.Table, pyarrow.RecordBatch, pyarrow.RecordBatchReader, ds.Dataset, pd.DataFrame], predicate: str, source_alias: Optional[str] = None, target_alias: Optional[str] = None, error_on_type_mismatch: bool = True, writer_properties: Optional[WriterProperties] = None, large_dtypes: bool = True, custom_metadata: Optional[Dict[str, str]] = None) -> TableMerger\n

Pass the source data which you want to merge into the target delta table, providing a predicate in a SQL-like format. You can also specify what to do when the underlying data types do not match those of the target table.
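
A usage sketch of an upsert; the TableMerger builder methods shown (when_matched_update_all, when_not_matched_insert_all) and the data are assumptions used for illustration.

import pyarrow as pa\nfrom deltalake import DeltaTable\n\ndt = DeltaTable(\"tmp/some_table\")\nsource = pa.table({\"id\": [1, 4], \"value\": [\"updated\", \"new\"]})\n\n(\n    dt.merge(\n        source=source,\n        predicate=\"target.id = source.id\",\n        source_alias=\"source\",\n        target_alias=\"target\",\n    )\n    .when_matched_update_all()\n    .when_not_matched_insert_all()\n    .execute()\n)\n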

    Parameters:

    Name Type Description Default source Union[Table, RecordBatch, RecordBatchReader, Dataset, DataFrame]

    source data

    required predicate str

    SQL like predicate on how to merge

    required source_alias Optional[str]

    Alias for the source table

    None target_alias Optional[str]

    Alias for the target table

    None error_on_type_mismatch bool

specify whether merge will return an error if data types are mismatching (default: True)

    True writer_properties Optional[WriterProperties]

    Pass writer properties to the Rust parquet writer

    None large_dtypes bool

    If True, the data schema is kept in large_dtypes.

    True custom_metadata Optional[Dict[str, str]]

    custom metadata that will be added to the transaction commit.

    None

    Returns:

    Name Type Description TableMerger TableMerger

    TableMerger Object

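Example

A hypothetical upsert sketch (the table path and column names are assumptions); merge returns a TableMerger whose clause builders are documented on the TableMerger page:

from deltalake import DeltaTable, write_deltalake\nimport pyarrow as pa\n\nwrite_deltalake(\"tmp_merge\", pa.table({\"x\": [1, 2], \"y\": [4, 5]}))\ndt = DeltaTable(\"tmp_merge\")\nnew_data = pa.table({\"x\": [2, 3], \"y\": [50, 60]})\n\n# update matching rows and insert the rest; execute() returns the merge metrics\n(\n    dt.merge(\n        source=new_data,\n        predicate=\"target.x = source.x\",\n        source_alias=\"source\",\n        target_alias=\"target\")\n    .when_matched_update_all()\n    .when_not_matched_insert_all()\n    .execute()\n)\n
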
    ","boost":2},{"location":"api/delta_table/#deltalake.DeltaTable.metadata","title":"metadata","text":"
    metadata() -> Metadata\n

    Get the current metadata of the DeltaTable.

    Returns:

    Type Description Metadata

    the current Metadata registered in the transaction log

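Example

A minimal sketch, assuming an existing table at the hypothetical path tmp/some-table:

from deltalake import DeltaTable\ndt = DeltaTable(\"tmp/some-table\")\nmeta = dt.metadata()\n# a few of the fields stored in the transaction log\nprint(meta.id, meta.partition_columns, meta.configuration)\n
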
    ","boost":2},{"location":"api/delta_table/#deltalake.DeltaTable.protocol","title":"protocol","text":"
    protocol() -> ProtocolVersions\n

    Get the reader and writer protocol versions of the DeltaTable.

    Returns:

    Type Description ProtocolVersions

    the current ProtocolVersions registered in the transaction log

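Example

A minimal sketch, assuming an existing table at the hypothetical path tmp/some-table:

from deltalake import DeltaTable\ndt = DeltaTable(\"tmp/some-table\")\n# the returned ProtocolVersions reports the minimum reader and writer versions\nprint(dt.protocol())\n
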
    ","boost":2},{"location":"api/delta_table/#deltalake.DeltaTable.repair","title":"repair","text":"
    repair(dry_run: bool = False, custom_metadata: Optional[Dict[str, str]] = None) -> Dict[str, Any]\n

Repair the Delta Table by auditing active files that do not exist in the underlying filesystem and removing them. This can be useful when there are accidental deletions or corrupted files.

    Active files are ones that have an add action in the log, but no corresponding remove action. This operation creates a new FSCK transaction containing a remove action for each of the missing or corrupted files.

    Parameters:

    Name Type Description Default dry_run bool

    when activated, list only the files, otherwise add remove actions to transaction log. Defaults to False.

    False custom_metadata Optional[Dict[str, str]]

    custom metadata that will be added to the transaction commit.

    None

    Returns: The metrics from repair (FSCK) action.

    Example

    from deltalake import DeltaTable\ndt = DeltaTable('TEST')\ndt.repair(dry_run=False)\n
    Results in
{'dry_run': False, 'files_removed': ['6-0d084325-6885-4847-b008-82c1cf30674c-0.parquet', '5-4fba1d3e-3e20-4de1-933d-a8e13ac59f53-0.parquet']}\n

    ","boost":2},{"location":"api/delta_table/#deltalake.DeltaTable.restore","title":"restore","text":"
    restore(target: Union[int, datetime, str], *, ignore_missing_files: bool = False, protocol_downgrade_allowed: bool = False, custom_metadata: Optional[Dict[str, str]] = None) -> Dict[str, Any]\n

    Run the Restore command on the Delta Table: restore table to a given version or datetime.

    Parameters:

    Name Type Description Default target Union[int, datetime, str]

the version to restore to, which can be represented as an int, a date string, or a datetime.

    required ignore_missing_files bool

whether the operation should carry on when some data files are missing.

    False protocol_downgrade_allowed bool

whether to allow the protocol version to be downgraded when restoring to an older version.

    False custom_metadata Optional[Dict[str, str]]

    custom metadata that will be added to the transaction commit.

    None

    Returns:

    Type Description Dict[str, Any]

    the metrics from restore.

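Example

A minimal sketch, assuming an existing table at the hypothetical path tmp/some-table that has at least two versions:

from deltalake import DeltaTable\ndt = DeltaTable(\"tmp/some-table\")\n# roll the table back to version 1 and print the returned metrics\nprint(dt.restore(1))\n
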
    ","boost":2},{"location":"api/delta_table/#deltalake.DeltaTable.schema","title":"schema","text":"
    schema() -> DeltaSchema\n

    Get the current schema of the DeltaTable.

    Returns:

    Type Description Schema

    the current Schema registered in the transaction log

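Example

A minimal sketch, assuming an existing table at the hypothetical path tmp/some-table:

from deltalake import DeltaTable\ndt = DeltaTable(\"tmp/some-table\")\n# the Delta schema can also be converted to a PyArrow schema for inspection\nprint(dt.schema())\nprint(dt.schema().to_pyarrow())\n
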
    ","boost":2},{"location":"api/delta_table/#deltalake.DeltaTable.to_pandas","title":"to_pandas","text":"
    to_pandas(partitions: Optional[List[Tuple[str, str, Any]]] = None, columns: Optional[List[str]] = None, filesystem: Optional[Union[str, pa_fs.FileSystem]] = None, filters: Optional[FilterType] = None) -> pd.DataFrame\n

    Build a pandas dataframe using data from the DeltaTable.

    Parameters:

    Name Type Description Default partitions Optional[List[Tuple[str, str, Any]]]

    A list of partition filters, see help(DeltaTable.files_by_partitions) for filter syntax

    None columns Optional[List[str]]

    The columns to project. This can be a list of column names to include (order and duplicates will be preserved)

    None filesystem Optional[Union[str, FileSystem]]

    A concrete implementation of the Pyarrow FileSystem or a fsspec-compatible interface. If None, the first file path will be used to determine the right FileSystem

    None filters Optional[FilterType]

    A disjunctive normal form (DNF) predicate for filtering rows. If you pass a filter you do not need to pass partitions

    None","boost":2},{"location":"api/delta_table/#deltalake.DeltaTable.to_pyarrow_dataset","title":"to_pyarrow_dataset","text":"
    to_pyarrow_dataset(partitions: Optional[List[Tuple[str, str, Any]]] = None, filesystem: Optional[Union[str, pa_fs.FileSystem]] = None, parquet_read_options: Optional[ParquetReadOptions] = None, schema: Optional[pyarrow.Schema] = None, as_large_types: bool = False) -> pyarrow.dataset.Dataset\n

    Build a PyArrow Dataset using data from the DeltaTable.

    Parameters:

    Name Type Description Default partitions Optional[List[Tuple[str, str, Any]]]

    A list of partition filters, see help(DeltaTable.files_by_partitions) for filter syntax

    None filesystem Optional[Union[str, FileSystem]]

    A concrete implementation of the Pyarrow FileSystem or a fsspec-compatible interface. If None, the first file path will be used to determine the right FileSystem

    None parquet_read_options Optional[ParquetReadOptions]

    Optional read options for Parquet. Use this to handle INT96 to timestamp conversion for edge cases like 0001-01-01 or 9999-12-31

    None schema Optional[Schema]

The schema to use for the dataset. If None, the schema of the DeltaTable will be used. This can be used to force reading of Parquet/Arrow datatypes that DeltaLake can't represent in its schema (e.g. LargeString). If you only need to read the schema with large types (e.g. for compatibility with Polars) you may want to use the as_large_types parameter instead.

    None as_large_types bool

    get schema with all variable size types (list, binary, string) as large variants (with int64 indices). This is for compatibility with systems like Polars that only support the large versions of Arrow types. If schema is passed it takes precedence over this option.

    False

    More info: https://arrow.apache.org/docs/python/generated/pyarrow.dataset.ParquetReadOptions.html

    Example

deltalake will work with any storage compliant with pyarrow.fs.FileSystem; however, the root of the filesystem has to be adjusted to point at the root of the Delta table. We can achieve this by wrapping the custom filesystem into a pyarrow.fs.SubTreeFileSystem.

    import pyarrow.fs as fs\nfrom deltalake import DeltaTable\n\ntable_uri = \"s3://<bucket>/<path>\"\nraw_fs, normalized_path = fs.FileSystem.from_uri(table_uri)\nfilesystem = fs.SubTreeFileSystem(normalized_path, raw_fs)\n\ndt = DeltaTable(table_uri)\nds = dt.to_pyarrow_dataset(filesystem=filesystem)\n

    Returns:

    Type Description Dataset

the PyArrow Dataset for the Delta table

    ","boost":2},{"location":"api/delta_table/#deltalake.DeltaTable.to_pyarrow_table","title":"to_pyarrow_table","text":"
    to_pyarrow_table(partitions: Optional[List[Tuple[str, str, Any]]] = None, columns: Optional[List[str]] = None, filesystem: Optional[Union[str, pa_fs.FileSystem]] = None, filters: Optional[FilterType] = None) -> pyarrow.Table\n

    Build a PyArrow Table using data from the DeltaTable.

    Parameters:

    Name Type Description Default partitions Optional[List[Tuple[str, str, Any]]]

    A list of partition filters, see help(DeltaTable.files_by_partitions) for filter syntax

    None columns Optional[List[str]]

    The columns to project. This can be a list of column names to include (order and duplicates will be preserved)

    None filesystem Optional[Union[str, FileSystem]]

    A concrete implementation of the Pyarrow FileSystem or a fsspec-compatible interface. If None, the first file path will be used to determine the right FileSystem

    None filters Optional[FilterType]

    A disjunctive normal form (DNF) predicate for filtering rows. If you pass a filter you do not need to pass partitions

    None","boost":2},{"location":"api/delta_table/#deltalake.DeltaTable.update","title":"update","text":"
    update(updates: Optional[Dict[str, str]] = None, new_values: Optional[Dict[str, Union[int, float, str, datetime, bool, List[Any]]]] = None, predicate: Optional[str] = None, writer_properties: Optional[WriterProperties] = None, error_on_type_mismatch: bool = True, custom_metadata: Optional[Dict[str, str]] = None) -> Dict[str, Any]\n

UPDATE records in the Delta Table that match an optional predicate. Either updates or new_values needs to be passed for it to execute.

    Parameters:

    Name Type Description Default updates Optional[Dict[str, str]]

    a mapping of column name to update SQL expression.

    None new_values Optional[Dict[str, Union[int, float, str, datetime, bool, List[Any]]]]

    a mapping of column name to python datatype.

    None predicate Optional[str]

    a logical expression.

    None writer_properties Optional[WriterProperties]

    Pass writer properties to the Rust parquet writer.

    None error_on_type_mismatch bool

specify whether update will return an error if data types are mismatching (default: True)

    True custom_metadata Optional[Dict[str, str]]

    custom metadata that will be added to the transaction commit.

    None

    Returns: the metrics from update

    Example

    Update some row values with SQL predicate

    This is equivalent to UPDATE table SET deleted = true WHERE id = '3'

    from deltalake import write_deltalake, DeltaTable\nimport pandas as pd\ndf = pd.DataFrame(\n    {\"id\": [\"1\", \"2\", \"3\"],\n    \"deleted\": [False, False, False],\n    \"price\": [10., 15., 20.]\n    })\nwrite_deltalake(\"tmp\", df)\ndt = DeltaTable(\"tmp\")\ndt.update(predicate=\"id = '3'\", updates = {\"deleted\": 'True'})\n\n{'num_added_files': 1, 'num_removed_files': 1, 'num_updated_rows': 1, 'num_copied_rows': 2, 'execution_time_ms': ..., 'scan_time_ms': ...}\n

    Update all row values

    This is equivalent to UPDATE table SET deleted = true, id = concat(id, '_old').

    dt.update(updates = {\"deleted\": 'True', \"id\": \"concat(id, '_old')\"})\n\n{'num_added_files': 1, 'num_removed_files': 1, 'num_updated_rows': 3, 'num_copied_rows': 0, 'execution_time_ms': ..., 'scan_time_ms': ...}\n

    Use Python objects instead of SQL strings

    Use the new_values parameter instead of the updates parameter. For example, this is equivalent to UPDATE table SET price = 150.10 WHERE id = '1'

    dt.update(predicate=\"id = '1_old'\", new_values = {\"price\": 150.10})\n\n{'num_added_files': 1, 'num_removed_files': 1, 'num_updated_rows': 1, 'num_copied_rows': 2, 'execution_time_ms': ..., 'scan_time_ms': ...}\n

    ","boost":2},{"location":"api/delta_table/#deltalake.DeltaTable.update_incremental","title":"update_incremental","text":"
    update_incremental() -> None\n

    Updates the DeltaTable to the latest version by incrementally applying newer versions.

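Example

A minimal sketch, assuming a long-lived DeltaTable object whose underlying table (at the hypothetical path tmp/some-table) is written to by other processes:

from deltalake import DeltaTable\ndt = DeltaTable(\"tmp/some-table\")\n# ... other writers commit new versions to the table ...\n# refresh this instance to the latest version without recreating it\ndt.update_incremental()\nprint(dt.version())\n
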
    ","boost":2},{"location":"api/delta_table/#deltalake.DeltaTable.vacuum","title":"vacuum","text":"
    vacuum(retention_hours: Optional[int] = None, dry_run: bool = True, enforce_retention_duration: bool = True, custom_metadata: Optional[Dict[str, str]] = None) -> List[str]\n

Run the Vacuum command on the Delta Table: list and delete files that are no longer referenced by the Delta table and are older than the retention threshold.

    Parameters:

    Name Type Description Default retention_hours Optional[int]

    the retention threshold in hours, if none then the value from configuration.deletedFileRetentionDuration is used or default of 1 week otherwise.

    None dry_run bool

when activated, only list the files; otherwise, delete them

    True enforce_retention_duration bool

    when disabled, accepts retention hours smaller than the value from configuration.deletedFileRetentionDuration.

    True custom_metadata Optional[Dict[str, str]]

    custom metadata that will be added to the transaction commit.

    None

Returns: the list of files that are no longer referenced by the Delta Table and are older than the retention threshold.

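Example

A minimal sketch, assuming an existing table at the hypothetical path tmp/some-table; with dry_run=True nothing is deleted:

from deltalake import DeltaTable\ndt = DeltaTable(\"tmp/some-table\")\n# list the files that would be removed, without deleting anything\nprint(dt.vacuum(retention_hours=168, dry_run=True))\n# to actually delete them (irreversible), re-run with dry_run=False\n# dt.vacuum(retention_hours=168, dry_run=False)\n
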
    ","boost":2},{"location":"api/delta_table/#deltalake.DeltaTable.version","title":"version","text":"
    version() -> int\n

    Get the version of the DeltaTable.

    Returns:

    Type Description int

    The current version of the DeltaTable

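Example

A minimal sketch, assuming an existing table at the hypothetical path tmp/some-table:

from deltalake import DeltaTable\ndt = DeltaTable(\"tmp/some-table\")\n# the version this DeltaTable object currently points at\nprint(dt.version())\n
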
    ","boost":2},{"location":"api/delta_table/delta_table_alterer/","title":"TableAlterer","text":"","boost":10},{"location":"api/delta_table/delta_table_alterer/#deltalake.table.TableAlterer","title":"deltalake.table.TableAlterer","text":"
    TableAlterer(table: DeltaTable)\n

    API for various table alteration commands.

    ","boost":10},{"location":"api/delta_table/delta_table_alterer/#deltalake.table.TableAlterer.add_constraint","title":"add_constraint","text":"
    add_constraint(constraints: Dict[str, str], custom_metadata: Optional[Dict[str, str]] = None) -> None\n

Add constraints to the table. Limited to a single constraint at a time.

    Parameters:

    Name Type Description Default constraints Dict[str, str]

    mapping of constraint name to SQL-expression to evaluate on write

    required custom_metadata Optional[Dict[str, str]]

    custom metadata that will be added to the transaction commit.

    None

    Example:

    from deltalake import DeltaTable\ndt = DeltaTable(\"test_table_constraints\")\ndt.alter.add_constraint({\n    \"value_gt_5\": \"value > 5\",\n})\n

Check configuration\n\ndt.metadata().configuration\n{'delta.constraints.value_gt_5': 'value > 5'}\n
    ","boost":10},{"location":"api/delta_table/delta_table_alterer/#deltalake.table.TableAlterer.drop_constraint","title":"drop_constraint","text":"
    drop_constraint(name: str, raise_if_not_exists: bool = True, custom_metadata: Optional[Dict[str, str]] = None) -> None\n

Drop constraints from a table. Limited to a single constraint at a time.

    Parameters:

    Name Type Description Default name str

    constraint name which to drop.

    required raise_if_not_exists bool

whether to raise an error if the constraint does not exist.

    True custom_metadata Optional[Dict[str, str]]

    custom metadata that will be added to the transaction commit.

    None

    Example:

    from deltalake import DeltaTable\ndt = DeltaTable(\"test_table_constraints\")\ndt.metadata().configuration\n{'delta.constraints.value_gt_5': 'value > 5'}\n

Drop the constraint\n\ndt.alter.drop_constraint(name = \"value_gt_5\")\n\nConfiguration after dropping\n\ndt.metadata().configuration\n{}\n
    ","boost":10},{"location":"api/delta_table/delta_table_merger/","title":"TableMerger","text":"","boost":2},{"location":"api/delta_table/delta_table_merger/#deltalake.table.TableMerger","title":"deltalake.table.TableMerger","text":"
    TableMerger(table: DeltaTable, source: pyarrow.RecordBatchReader, predicate: str, source_alias: Optional[str] = None, target_alias: Optional[str] = None, safe_cast: bool = True, writer_properties: Optional[WriterProperties] = None, custom_metadata: Optional[Dict[str, str]] = None)\n

    API for various table MERGE commands.

    ","boost":2},{"location":"api/delta_table/delta_table_merger/#deltalake.table.TableMerger.execute","title":"execute","text":"
    execute() -> Dict[str, Any]\n

Executes MERGE with the previously provided settings in Rust, using the Apache DataFusion query engine.

    Returns:

    Name Type Description Dict Dict[str, Any]

    metrics

    ","boost":2},{"location":"api/delta_table/delta_table_merger/#deltalake.table.TableMerger.when_matched_delete","title":"when_matched_delete","text":"
    when_matched_delete(predicate: Optional[str] = None) -> TableMerger\n

    Delete a matched row from the table only if the given predicate (if specified) is true for the matched row. If not specified it deletes all matches.

    Note

Column names with special characters, such as numbers or spaces, should be encapsulated in backticks: \"target.123column\" or \"target.my column\"

    Parameters:

    Name Type Description Default predicate (str | None, Optional)

    SQL like predicate on when to delete.

    None

    Returns:

    Name Type Description TableMerger TableMerger

    TableMerger Object

    Example

    Delete on a predicate

    from deltalake import DeltaTable, write_deltalake\nimport pyarrow as pa\n\ndata = pa.table({\"x\": [1, 2, 3], \"y\": [4, 5, 6]})\nwrite_deltalake(\"tmp\", data)\ndt = DeltaTable(\"tmp\")\nnew_data = pa.table({\"x\": [2, 3], \"deleted\": [False, True]})\n\n(\n    dt.merge(\n        source=new_data,\n        predicate='target.x = source.x',\n        source_alias='source',\n        target_alias='target')\n    .when_matched_delete(\n        predicate=\"source.deleted = true\")\n    .execute()\n)\n{'num_source_rows': 2, 'num_target_rows_inserted': 0, 'num_target_rows_updated': 0, 'num_target_rows_deleted': 1, 'num_target_rows_copied': 2, 'num_output_rows': 2, 'num_target_files_added': 1, 'num_target_files_removed': 1, 'execution_time_ms': ..., 'scan_time_ms': ..., 'rewrite_time_ms': ...}\n\ndt.to_pandas().sort_values(\"x\", ignore_index=True)\n   x  y\n0  1  4\n1  2  5\n

    Delete all records that were matched

    dt = DeltaTable(\"tmp\")\n(\n    dt.merge(\n        source=new_data,\n        predicate='target.x = source.x',\n        source_alias='source',\n        target_alias='target')\n    .when_matched_delete()\n    .execute()\n)\n{'num_source_rows': 2, 'num_target_rows_inserted': 0, 'num_target_rows_updated': 0, 'num_target_rows_deleted': 1, 'num_target_rows_copied': 1, 'num_output_rows': 1, 'num_target_files_added': 1, 'num_target_files_removed': 1, 'execution_time_ms': ..., 'scan_time_ms': ..., 'rewrite_time_ms': ...}\n\ndt.to_pandas()\n   x  y\n0  1  4\n

    ","boost":2},{"location":"api/delta_table/delta_table_merger/#deltalake.table.TableMerger.when_matched_update","title":"when_matched_update","text":"
    when_matched_update(updates: Dict[str, str], predicate: Optional[str] = None) -> TableMerger\n

    Update a matched table row based on the rules defined by updates. If a predicate is specified, then it must evaluate to true for the row to be updated.

    Note

Column names with special characters, such as numbers or spaces, should be encapsulated in backticks: \"target.123column\" or \"target.my column\"

    Parameters:

    Name Type Description Default updates Dict[str, str]

    a mapping of column name to update SQL expression.

    required predicate Optional[str]

    SQL like predicate on when to update.

    None

    Returns:

    Name Type Description TableMerger TableMerger

    TableMerger Object

    Example
    from deltalake import DeltaTable, write_deltalake\nimport pyarrow as pa\n\ndata = pa.table({\"x\": [1, 2, 3], \"1y\": [4, 5, 6]})\nwrite_deltalake(\"tmp\", data)\ndt = DeltaTable(\"tmp\")\nnew_data = pa.table({\"x\": [1], \"1y\": [7]})\n\n(\n     dt.merge(\n         source=new_data,\n         predicate=\"target.x = source.x\",\n         source_alias=\"source\",\n         target_alias=\"target\")\n     .when_matched_update(updates={\"x\": \"source.x\", \"`1y`\": \"source.`1y`\"})\n     .execute()\n)\n{'num_source_rows': 1, 'num_target_rows_inserted': 0, 'num_target_rows_updated': 1, 'num_target_rows_deleted': 0, 'num_target_rows_copied': 2, 'num_output_rows': 3, 'num_target_files_added': 1, 'num_target_files_removed': 1, 'execution_time_ms': ..., 'scan_time_ms': ..., 'rewrite_time_ms': ...}\n\ndt.to_pandas()\n   x  y\n0  1  7\n1  2  5\n2  3  6\n
    ","boost":2},{"location":"api/delta_table/delta_table_merger/#deltalake.table.TableMerger.when_matched_update_all","title":"when_matched_update_all","text":"
    when_matched_update_all(predicate: Optional[str] = None) -> TableMerger\n

Update all source fields to target fields; source and target are required to have the same field names. If a predicate is specified, then it must evaluate to true for the row to be updated.

    Note

Column names with special characters, such as numbers or spaces, should be encapsulated in backticks: \"target.123column\" or \"target.my column\"

    Parameters:

    Name Type Description Default predicate Optional[str]

    SQL like predicate on when to update all columns.

    None

    Returns:

    Name Type Description TableMerger TableMerger

    TableMerger Object

    Example
    from deltalake import DeltaTable, write_deltalake\nimport pyarrow as pa\n\ndata = pa.table({\"x\": [1, 2, 3], \"y\": [4, 5, 6]})\nwrite_deltalake(\"tmp\", data)\ndt = DeltaTable(\"tmp\")\nnew_data = pa.table({\"x\": [1], \"y\": [7]})\n\n(\n    dt.merge(\n        source=new_data,\n        predicate=\"target.x = source.x\",\n        source_alias=\"source\",\n        target_alias=\"target\")\n    .when_matched_update_all()\n    .execute()\n)\n{'num_source_rows': 1, 'num_target_rows_inserted': 0, 'num_target_rows_updated': 1, 'num_target_rows_deleted': 0, 'num_target_rows_copied': 2, 'num_output_rows': 3, 'num_target_files_added': 1, 'num_target_files_removed': 1, 'execution_time_ms': ..., 'scan_time_ms': ..., 'rewrite_time_ms': ...}\n\ndt.to_pandas()\n   x  y\n0  1  7\n1  2  5\n2  3  6\n
    ","boost":2},{"location":"api/delta_table/delta_table_merger/#deltalake.table.TableMerger.when_not_matched_by_source_delete","title":"when_not_matched_by_source_delete","text":"
    when_not_matched_by_source_delete(predicate: Optional[str] = None) -> TableMerger\n

Delete a target row that has no matches in the source, but only if the given predicate (if specified) is true for the target row.

    Note

Column names with special characters, such as numbers or spaces, should be encapsulated in backticks: \"target.123column\" or \"target.my column\"

    Parameters:

    Name Type Description Default predicate Optional[str]

    SQL like predicate on when to delete when not matched by source.

    None

    Returns:

    Name Type Description TableMerger TableMerger

    TableMerger Object

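Example

A hypothetical sketch (the table path and columns are assumptions); target rows with no match in the source are deleted:

from deltalake import DeltaTable, write_deltalake\nimport pyarrow as pa\n\nwrite_deltalake(\"tmp_nmbs\", pa.table({\"x\": [1, 2, 3], \"y\": [4, 5, 6]}))\ndt = DeltaTable(\"tmp_nmbs\")\nnew_data = pa.table({\"x\": [2, 3]})\n\n# the row with x = 1 has no match in the source, so it is deleted\n(\n    dt.merge(\n        source=new_data,\n        predicate=\"target.x = source.x\",\n        source_alias=\"source\",\n        target_alias=\"target\")\n    .when_not_matched_by_source_delete()\n    .execute()\n)\n
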
    ","boost":2},{"location":"api/delta_table/delta_table_merger/#deltalake.table.TableMerger.when_not_matched_by_source_update","title":"when_not_matched_by_source_update","text":"
    when_not_matched_by_source_update(updates: Dict[str, str], predicate: Optional[str] = None) -> TableMerger\n

    Update a target row that has no matches in the source based on the rules defined by updates. If a predicate is specified, then it must evaluate to true for the row to be updated.

    Note

Column names with special characters, such as numbers or spaces, should be encapsulated in backticks: \"target.123column\" or \"target.my column\"

    Parameters:

    Name Type Description Default updates Dict[str, str]

    a mapping of column name to update SQL expression.

    required predicate Optional[str]

    SQL like predicate on when to update.

    None

    Returns:

    Name Type Description TableMerger TableMerger

    TableMerger Object

    Example
    from deltalake import DeltaTable, write_deltalake\nimport pyarrow as pa\n\ndata = pa.table({\"x\": [1, 2, 3], \"y\": [4, 5, 6]})\nwrite_deltalake(\"tmp\", data)\ndt = DeltaTable(\"tmp\")\nnew_data = pa.table({\"x\": [2, 3, 4]})\n\n(\n   dt.merge(\n       source=new_data,\n       predicate='target.x = source.x',\n       source_alias='source',\n       target_alias='target')\n   .when_not_matched_by_source_update(\n       predicate = \"y > 3\",\n       updates = {\"y\": \"0\"})\n   .execute()\n)\n{'num_source_rows': 3, 'num_target_rows_inserted': 0, 'num_target_rows_updated': 1, 'num_target_rows_deleted': 0, 'num_target_rows_copied': 2, 'num_output_rows': 3, 'num_target_files_added': 1, 'num_target_files_removed': 1, 'execution_time_ms': ..., 'scan_time_ms': ..., 'rewrite_time_ms': ...}\n\ndt.to_pandas().sort_values(\"x\", ignore_index=True)\n   x  y\n0  1  0\n1  2  5\n2  3  6\n
    ","boost":2},{"location":"api/delta_table/delta_table_merger/#deltalake.table.TableMerger.when_not_matched_insert","title":"when_not_matched_insert","text":"
    when_not_matched_insert(updates: Dict[str, str], predicate: Optional[str] = None) -> TableMerger\n

    Insert a new row to the target table based on the rules defined by updates. If a predicate is specified, then it must evaluate to true for the new row to be inserted.

    Note

Column names with special characters, such as numbers or spaces, should be encapsulated in backticks: \"target.123column\" or \"target.my column\"

    Parameters:

    Name Type Description Default updates dict

    a mapping of column name to insert SQL expression.

    required predicate (str | None, Optional)

    SQL like predicate on when to insert.

    None

    Returns:

    Name Type Description TableMerger TableMerger

    TableMerger Object

    Example
    from deltalake import DeltaTable, write_deltalake\nimport pyarrow as pa\n\ndata = pa.table({\"x\": [1, 2, 3], \"y\": [4, 5, 6]})\nwrite_deltalake(\"tmp\", data)\ndt = DeltaTable(\"tmp\")\nnew_data = pa.table({\"x\": [4], \"y\": [7]})\n\n(\n    dt.merge(\n        source=new_data,\n        predicate=\"target.x = source.x\",\n        source_alias=\"source\",\n        target_alias=\"target\",)\n    .when_not_matched_insert(\n        updates={\n            \"x\": \"source.x\",\n            \"y\": \"source.y\",\n        })\n    .execute()\n)\n{'num_source_rows': 1, 'num_target_rows_inserted': 1, 'num_target_rows_updated': 0, 'num_target_rows_deleted': 0, 'num_target_rows_copied': 3, 'num_output_rows': 4, 'num_target_files_added': 1, 'num_target_files_removed': 1, 'execution_time_ms': ..., 'scan_time_ms': ..., 'rewrite_time_ms': ...}\n\ndt.to_pandas().sort_values(\"x\", ignore_index=True)\n   x  y\n0  1  4\n1  2  5\n2  3  6\n3  4  7\n
    ","boost":2},{"location":"api/delta_table/delta_table_merger/#deltalake.table.TableMerger.when_not_matched_insert_all","title":"when_not_matched_insert_all","text":"
    when_not_matched_insert_all(predicate: Optional[str] = None) -> TableMerger\n

    Insert a new row to the target table, updating all source fields to target fields. Source and target are required to have the same field names. If a predicate is specified, then it must evaluate to true for the new row to be inserted.

    Note

Column names with special characters, such as numbers or spaces, should be encapsulated in backticks: \"target.123column\" or \"target.my column\"

    Parameters:

    Name Type Description Default predicate Optional[str]

    SQL like predicate on when to insert.

    None

    Returns:

    Name Type Description TableMerger TableMerger

    TableMerger Object

    Example
    from deltalake import DeltaTable, write_deltalake\nimport pyarrow as pa\n\ndata = pa.table({\"x\": [1, 2, 3], \"y\": [4, 5, 6]})\nwrite_deltalake(\"tmp\", data)\ndt = DeltaTable(\"tmp\")\nnew_data = pa.table({\"x\": [4], \"y\": [7]})\n\n(\n   dt.merge(\n       source=new_data,\n       predicate='target.x = source.x',\n       source_alias='source',\n       target_alias='target')\n   .when_not_matched_insert_all()\n   .execute()\n)\n{'num_source_rows': 1, 'num_target_rows_inserted': 1, 'num_target_rows_updated': 0, 'num_target_rows_deleted': 0, 'num_target_rows_copied': 3, 'num_output_rows': 4, 'num_target_files_added': 1, 'num_target_files_removed': 1, 'execution_time_ms': ..., 'scan_time_ms': ..., 'rewrite_time_ms': ...}\n\ndt.to_pandas().sort_values(\"x\", ignore_index=True)\n   x  y\n0  1  4\n1  2  5\n2  3  6\n3  4  7\n
    ","boost":2},{"location":"api/delta_table/delta_table_merger/#deltalake.table.TableMerger.with_writer_properties","title":"with_writer_properties","text":"
    with_writer_properties(data_page_size_limit: Optional[int] = None, dictionary_page_size_limit: Optional[int] = None, data_page_row_count_limit: Optional[int] = None, write_batch_size: Optional[int] = None, max_row_group_size: Optional[int] = None) -> TableMerger\n

    Deprecated

    Use .merge(writer_properties = WriterProperties()) instead

    Pass writer properties to the Rust parquet writer, see options https://arrow.apache.org/rust/parquet/file/properties/struct.WriterProperties.html:

    Parameters:

    Name Type Description Default data_page_size_limit Optional[int]

    Limit DataPage size to this in bytes.

    None dictionary_page_size_limit Optional[int]

    Limit the size of each DataPage to store dicts to this amount in bytes.

    None data_page_row_count_limit Optional[int]

    Limit the number of rows in each DataPage.

    None write_batch_size Optional[int]

    Splits internally to smaller batch size.

    None max_row_group_size Optional[int]

    Max number of rows in row group.

    None

    Returns:

    Name Type Description TableMerger TableMerger

    TableMerger Object

    ","boost":2},{"location":"api/delta_table/delta_table_optimizer/","title":"TableOptimizer","text":"","boost":10},{"location":"api/delta_table/delta_table_optimizer/#deltalake.table.TableOptimizer","title":"deltalake.table.TableOptimizer","text":"
    TableOptimizer(table: DeltaTable)\n

    API for various table optimization commands.

    ","boost":10},{"location":"api/delta_table/delta_table_optimizer/#deltalake.table.TableOptimizer.compact","title":"compact","text":"
    compact(partition_filters: Optional[FilterType] = None, target_size: Optional[int] = None, max_concurrent_tasks: Optional[int] = None, min_commit_interval: Optional[Union[int, timedelta]] = None, writer_properties: Optional[WriterProperties] = None, custom_metadata: Optional[Dict[str, str]] = None) -> Dict[str, Any]\n

    Compacts small files to reduce the total number of files in the table.

    This operation is idempotent; if run twice on the same table (assuming it has not been updated) it will do nothing the second time.

    If this operation happens concurrently with any operations other than append, it will fail.

    Parameters:

    Name Type Description Default partition_filters Optional[FilterType]

    the partition filters that will be used for getting the matched files

    None target_size Optional[int]

    desired file size after bin-packing files, in bytes. If not provided, will attempt to read the table configuration value delta.targetFileSize. If that value isn't set, will use default value of 256MB.

    None max_concurrent_tasks Optional[int]

    the maximum number of concurrent tasks to use for file compaction. Defaults to number of CPUs. More concurrent tasks can make compaction faster, but will also use more memory.

    None min_commit_interval Optional[Union[int, timedelta]]

    minimum interval in seconds or as timedeltas before a new commit is created. Interval is useful for long running executions. Set to 0 or timedelta(0), if you want a commit per partition.

    None writer_properties Optional[WriterProperties]

    Pass writer properties to the Rust parquet writer.

    None custom_metadata Optional[Dict[str, str]]

    custom metadata that will be added to the transaction commit.

    None

    Returns:

    Type Description Dict[str, Any]

    the metrics from optimize

    Example

    Use a timedelta object to specify the seconds, minutes or hours of the interval.

    from deltalake import DeltaTable, write_deltalake\nfrom datetime import timedelta\nimport pyarrow as pa\n\nwrite_deltalake(\"tmp\", pa.table({\"x\": [1], \"y\": [4]}))\nwrite_deltalake(\"tmp\", pa.table({\"x\": [2], \"y\": [5]}), mode=\"append\")\n\ndt = DeltaTable(\"tmp\")\ntime_delta = timedelta(minutes=10)\ndt.optimize.compact(min_commit_interval=time_delta)\n{'numFilesAdded': 1, 'numFilesRemoved': 2, 'filesAdded': ..., 'filesRemoved': ..., 'partitionsOptimized': 1, 'numBatches': 2, 'totalConsideredFiles': 2, 'totalFilesSkipped': 0, 'preserveInsertionOrder': True}\n

    ","boost":10},{"location":"api/delta_table/delta_table_optimizer/#deltalake.table.TableOptimizer.z_order","title":"z_order","text":"
    z_order(columns: Iterable[str], partition_filters: Optional[FilterType] = None, target_size: Optional[int] = None, max_concurrent_tasks: Optional[int] = None, max_spill_size: int = 20 * 1024 * 1024 * 1024, min_commit_interval: Optional[Union[int, timedelta]] = None, writer_properties: Optional[WriterProperties] = None, custom_metadata: Optional[Dict[str, str]] = None) -> Dict[str, Any]\n

    Reorders the data using a Z-order curve to improve data skipping.

    This also performs compaction, so the same parameters as compact() apply.

    Parameters:

    Name Type Description Default columns Iterable[str]

    the columns to use for Z-ordering. There must be at least one column.

    required partition_filters Optional[FilterType]

    the partition filters that will be used for getting the matched files

    None target_size Optional[int]

    desired file size after bin-packing files, in bytes. If not provided, will attempt to read the table configuration value delta.targetFileSize. If that value isn't set, will use default value of 256MB.

    None max_concurrent_tasks Optional[int]

    the maximum number of concurrent tasks to use for file compaction. Defaults to number of CPUs. More concurrent tasks can make compaction faster, but will also use more memory.

    None max_spill_size int

    the maximum number of bytes to spill to disk. Defaults to 20GB.

    20 * 1024 * 1024 * 1024 min_commit_interval Optional[Union[int, timedelta]]

    minimum interval in seconds or as timedeltas before a new commit is created. Interval is useful for long running executions. Set to 0 or timedelta(0), if you want a commit per partition.

    None writer_properties Optional[WriterProperties]

    Pass writer properties to the Rust parquet writer.

    None custom_metadata Optional[Dict[str, str]]

    custom metadata that will be added to the transaction commit.

    None

    Returns:

    Type Description Dict[str, Any]

    the metrics from optimize

    Example

    Use a timedelta object to specify the seconds, minutes or hours of the interval.

    from deltalake import DeltaTable, write_deltalake\nfrom datetime import timedelta\nimport pyarrow as pa\n\nwrite_deltalake(\"tmp\", pa.table({\"x\": [1], \"y\": [4]}))\nwrite_deltalake(\"tmp\", pa.table({\"x\": [2], \"y\": [5]}), mode=\"append\")\n\ndt = DeltaTable(\"tmp\")\ntime_delta = timedelta(minutes=10)\ndt.optimize.z_order([\"x\"], min_commit_interval=time_delta)\n{'numFilesAdded': 1, 'numFilesRemoved': 2, 'filesAdded': ..., 'filesRemoved': ..., 'partitionsOptimized': 0, 'numBatches': 1, 'totalConsideredFiles': 2, 'totalFilesSkipped': 0, 'preserveInsertionOrder': True}\n

    ","boost":10},{"location":"api/delta_table/metadata/","title":"Metadata","text":"","boost":2},{"location":"api/delta_table/metadata/#deltalake.Metadata","title":"deltalake.Metadata dataclass","text":"
    Metadata(table: RawDeltaTable)\n

    Create a Metadata instance.

    ","boost":2},{"location":"api/delta_table/metadata/#deltalake.Metadata.configuration","title":"configuration property","text":"
    configuration: Dict[str, str]\n

    Return the DeltaTable properties.

    ","boost":2},{"location":"api/delta_table/metadata/#deltalake.Metadata.created_time","title":"created_time property","text":"
    created_time: int\n

Return the time when this metadata action was created, in milliseconds since the Unix epoch.

    ","boost":2},{"location":"api/delta_table/metadata/#deltalake.Metadata.description","title":"description property","text":"
    description: str\n

    Return the user-provided description of the DeltaTable.

    ","boost":2},{"location":"api/delta_table/metadata/#deltalake.Metadata.id","title":"id property","text":"
    id: int\n

    Return the unique identifier of the DeltaTable.

    ","boost":2},{"location":"api/delta_table/metadata/#deltalake.Metadata.name","title":"name property","text":"
    name: str\n

    Return the user-provided identifier of the DeltaTable.

    ","boost":2},{"location":"api/delta_table/metadata/#deltalake.Metadata.partition_columns","title":"partition_columns property","text":"
    partition_columns: List[str]\n

    Return an array containing the names of the partitioned columns of the DeltaTable.

    ","boost":2},{"location":"how-delta-lake-works/architecture-of-delta-table/","title":"Architecture of a Delta Lake table","text":"

    A Delta table consists of Parquet files that contain data and a transaction log that stores metadata about the transactions.

    Let's create a Delta table, perform some operations, and inspect the files that are created.

    "},{"location":"how-delta-lake-works/architecture-of-delta-table/#delta-lake-transaction-examples","title":"Delta Lake transaction examples","text":"

    Start by creating a pandas DataFrame and writing it out to a Delta table.

    import pandas as pd\nfrom deltalake import DeltaTable, write_deltalake\n\ndf = pd.DataFrame({\"num\": [1, 2, 3], \"letter\": [\"a\", \"b\", \"c\"]})\nwrite_deltalake(\"tmp/some-table\", df)\n

    Now inspect the files created in storage:

    tmp/some-table\n\u251c\u2500\u2500 0-62dffa23-bbe1-4496-8fb5-bff6724dc677-0.parquet\n\u2514\u2500\u2500 _delta_log\n    \u2514\u2500\u2500 00000000000000000000.json\n

    The Parquet file stores the data that was written. The _delta_log directory stores metadata about the transactions. Let's inspect the _delta_log/00000000000000000000.json file.

    {\n  \"protocol\": {\n    \"minReaderVersion\": 1,\n    \"minWriterVersion\": 1\n  }\n}\n{\n  \"metaData\": {\n    \"id\": \"b96ea1a2-1830-4da2-8827-5334cc6104ed\",\n    \"name\": null,\n    \"description\": null,\n    \"format\": {\n      \"provider\": \"parquet\",\n      \"options\": {}\n    },\n    \"schemaString\": \"{\\\"type\\\":\\\"struct\\\",\\\"fields\\\":[{\\\"name\\\":\\\"num\\\",\\\"type\\\":\\\"long\\\",\\\"nullable\\\":true,\\\"metadata\\\":{}},{\\\"name\\\":\\\"letter\\\",\\\"type\\\":\\\"string\\\",\\\"nullable\\\":true,\\\"metadata\\\":{}}]}\",\n    \"partitionColumns\": [],\n    \"createdTime\": 1701740315599,\n    \"configuration\": {}\n  }\n}\n{\n  \"add\": {\n    \"path\": \"0-62dffa23-bbe1-4496-8fb5-bff6724dc677-0.parquet\",\n    \"size\": 2208,\n    \"partitionValues\": {},\n    \"modificationTime\": 1701740315597,\n    \"dataChange\": true,\n    \"stats\": \"{\\\"numRecords\\\": 3, \\\"minValues\\\": {\\\"num\\\": 1, \\\"letter\\\": \\\"a\\\"}, \\\"maxValues\\\": {\\\"num\\\": 3, \\\"letter\\\": \\\"c\\\"}, \\\"nullCount\\\": {\\\"num\\\": 0, \\\"letter\\\": 0}}\"\n  }\n}\n{\n  \"commitInfo\": {\n    \"timestamp\": 1701740315602,\n    \"operation\": \"CREATE TABLE\",\n    \"operationParameters\": {\n      \"location\": \"file:///Users/matthew.powers/Documents/code/delta/delta-examples/notebooks/python-deltalake/tmp/some-table\",\n      \"metadata\": \"{\\\"configuration\\\":{},\\\"created_time\\\":1701740315599,\\\"description\\\":null,\\\"format\\\":{\\\"options\\\":{},\\\"provider\\\":\\\"parquet\\\"},\\\"id\\\":\\\"b96ea1a2-1830-4da2-8827-5334cc6104ed\\\",\\\"name\\\":null,\\\"partition_columns\\\":[],\\\"schema\\\":{\\\"fields\\\":[{\\\"metadata\\\":{},\\\"name\\\":\\\"num\\\",\\\"nullable\\\":true,\\\"type\\\":\\\"long\\\"},{\\\"metadata\\\":{},\\\"name\\\":\\\"letter\\\",\\\"nullable\\\":true,\\\"type\\\":\\\"string\\\"}],\\\"type\\\":\\\"struct\\\"}}\",\n      \"protocol\": \"{\\\"minReaderVersion\\\":1,\\\"minWriterVersion\\\":1}\",\n      \"mode\": \"ErrorIfExists\"\n    },\n    \"clientVersion\": \"delta-rs.0.17.0\"\n  }\n}\n

The transaction log file contains the following information:

    • the files added to the Delta table
    • schema of the files
    • column level metadata including the min/max value for each file

    Create another pandas DataFrame and append it to the Delta table to see how this transaction is recorded.

    df = pd.DataFrame({\"num\": [8, 9], \"letter\": [\"dd\", \"ee\"]})\nwrite_deltalake(f\"{cwd}/tmp/delta-table\", df, mode=\"append\")\n

    Here are the files in storage:

    tmp/some-table\n\u251c\u2500\u2500 0-62dffa23-bbe1-4496-8fb5-bff6724dc677-0.parquet\n\u251c\u2500\u2500 1-57abb6fb-2249-43ba-a7be-cf09bcc230de-0.parquet\n\u2514\u2500\u2500 _delta_log\n    \u251c\u2500\u2500 00000000000000000000.json\n    \u2514\u2500\u2500 00000000000000000001.json\n

    Here are the contents of the _delta_log/00000000000000000001.json file:

    {\n  \"add\": {\n    \"path\": \"1-57abb6fb-2249-43ba-a7be-cf09bcc230de-0.parquet\",\n    \"size\": 2204,\n    \"partitionValues\": {},\n    \"modificationTime\": 1701740386169,\n    \"dataChange\": true,\n    \"stats\": \"{\\\"numRecords\\\": 2, \\\"minValues\\\": {\\\"num\\\": 8, \\\"letter\\\": \\\"dd\\\"}, \\\"maxValues\\\": {\\\"num\\\": 9, \\\"letter\\\": \\\"ee\\\"}, \\\"nullCount\\\": {\\\"num\\\": 0, \\\"letter\\\": 0}}\"\n  }\n}\n{\n  \"commitInfo\": {\n    \"timestamp\": 1701740386169,\n    \"operation\": \"WRITE\",\n    \"operationParameters\": {\n      \"partitionBy\": \"[]\",\n      \"mode\": \"Append\"\n    },\n    \"clientVersion\": \"delta-rs.0.17.0\"\n  }\n}\n

    The transaction log records that the second file has been persisted in the Delta table.

    Now create a third pandas DataFrame and overwrite the Delta table with the new data.

    df = pd.DataFrame({\"num\": [11, 22], \"letter\": [\"aa\", \"bb\"]})\nwrite_deltalake(f\"{cwd}/tmp/delta-table\", df, mode=\"append\")\n

    Here are the files in storage:

    tmp/some-table\n\u251c\u2500\u2500 0-62dffa23-bbe1-4496-8fb5-bff6724dc677-0.parquet\n\u251c\u2500\u2500 1-57abb6fb-2249-43ba-a7be-cf09bcc230de-0.parquet\n\u251c\u2500\u2500 2-95ef2108-480c-4b89-96f0-ff9185dab9ad-0.parquet\n\u2514\u2500\u2500 _delta_log\n    \u251c\u2500\u2500 00000000000000000000.json\n    \u251c\u2500\u2500 00000000000000000001.json\n    \u2514\u2500\u2500 00000000000000000002.json\n

Here are the contents of the _delta_log/00000000000000000002.json file:

    {\n  \"add\": {\n    \"path\": \"2-95ef2108-480c-4b89-96f0-ff9185dab9ad-0.parquet\",\n    \"size\": 2204,\n    \"partitionValues\": {},\n    \"modificationTime\": 1701740465102,\n    \"dataChange\": true,\n    \"stats\": \"{\\\"numRecords\\\": 2, \\\"minValues\\\": {\\\"num\\\": 11, \\\"letter\\\": \\\"aa\\\"}, \\\"maxValues\\\": {\\\"num\\\": 22, \\\"letter\\\": \\\"bb\\\"}, \\\"nullCount\\\": {\\\"num\\\": 0, \\\"letter\\\": 0}}\"\n  }\n}\n{\n  \"remove\": {\n    \"path\": \"0-62dffa23-bbe1-4496-8fb5-bff6724dc677-0.parquet\",\n    \"deletionTimestamp\": 1701740465102,\n    \"dataChange\": true,\n    \"extendedFileMetadata\": false,\n    \"partitionValues\": {},\n    \"size\": 2208\n  }\n}\n{\n  \"remove\": {\n    \"path\": \"1-57abb6fb-2249-43ba-a7be-cf09bcc230de-0.parquet\",\n    \"deletionTimestamp\": 1701740465102,\n    \"dataChange\": true,\n    \"extendedFileMetadata\": false,\n    \"partitionValues\": {},\n    \"size\": 2204\n  }\n}\n{\n  \"commitInfo\": {\n    \"timestamp\": 1701740465102,\n    \"operation\": \"WRITE\",\n    \"operationParameters\": {\n      \"mode\": \"Overwrite\",\n      \"partitionBy\": \"[]\"\n    },\n    \"clientVersion\": \"delta-rs.0.17.0\"\n  }\n}\n

This transaction adds a data file and marks the two existing data files for removal. Marking a file for removal in the transaction log is known as \"tombstoning the file\" or a \"logical delete\". This is different from a \"physical delete\", which actually removes the data file from storage.

    "},{"location":"how-delta-lake-works/architecture-of-delta-table/#how-delta-table-operations-differ-from-data-lakes","title":"How Delta table operations differ from data lakes","text":"

Data lakes consist of data files persisted in storage. They don't have a transaction log that retains metadata about the transactions.

    Data lakes perform transactions differently than Delta tables.

When you perform an overwrite transaction with a Delta table, you logically delete the existing data without physically removing it.

    Data lakes don't support logical deletes, so you have to physically delete the data from storage.

Logical data operations are safer because they can be rolled back if they don't complete successfully. Physically removing data from storage can be dangerous, especially if it happens before a transaction is complete.

    We're now ready to look into Delta Lake ACID transactions in more detail.

    "},{"location":"how-delta-lake-works/delta-lake-acid-transactions/","title":"Delta Lake Transactions","text":"

This page teaches you about Delta Lake transactions and why transactions are important in production data settings. Data lakes don\u2019t support transactions, and this is a huge downside: they offer a poor user experience, lack functionality, and can easily be corrupted.

Transactions on Delta Lake tables are operations that change the state of the table and record descriptive entries (metadata) of those changes to the Delta Lake transaction log. Here are some examples of transactions:

    • Deleting rows
    • Appending to the table
    • Compacting small files
    • Upserting
    • Overwriting rows

    All Delta Lake write operations are transactions in Delta tables. Reads actually aren\u2019t technically transactions because they don\u2019t result in new entries being appended to the transaction log.

    "},{"location":"how-delta-lake-works/delta-lake-acid-transactions/#what-are-transactions","title":"What are transactions?","text":"

Transactions are any Delta operations that change the underlying files of a Delta table and result in new metadata entries in the transaction log. Some Delta operations rearrange data in the existing table (like Z Ordering the table or compacting the small files) and these are also transactions. Let\u2019s look at a simple example.

    Suppose you have a Delta table with the following data:

    num animal\n1   cat\n2   dog\n3   snake\n

    Here\u2019s how to create this Delta table:

    import pandas as pd\nfrom deltalake import write_deltalake, DeltaTable\n\ndf = pd.DataFrame({\"num\": [1, 2, 3], \"animal\": [\"cat\", \"dog\", \"snake\"]})\nwrite_deltalake(\"tmp/my-delta-table\", df)\n

    Here are the files created in storage.

    tmp/my-delta-table\n\u251c\u2500\u2500 0-fea2de92-861a-423e-9708-a9e91dafb27b-0.parquet\n\u2514\u2500\u2500 _delta_log\n    \u2514\u2500\u2500 00000000000000000000.json\n

    Let\u2019s perform an operation to delete every animal from the Delta table that is a cat.

    dt = DeltaTable(\"tmp/my-delta-table\")\ndt.delete(\"animal = 'cat'\")\n

    Let\u2019s take a look at the contents of the Delta table now that the transaction is complete:

    tmp/my-delta-table\n\u251c\u2500\u2500 0-fea2de92-861a-423e-9708-a9e91dafb27b-0.parquet\n\u251c\u2500\u2500 _delta_log\n\u2502   \u251c\u2500\u2500 00000000000000000000.json\n\u2502   \u2514\u2500\u2500 00000000000000000001.json\n\u2514\u2500\u2500 part-00001-90312b96-b487-4a8f-9edc-1b9b3963f136-c000.snappy.parquet\n

    Notice the 00000000000000000001.json file that was added to the transaction log to record this transaction. Let\u2019s inspect the content of the file.

    {\n  \"add\": {\n    \"path\": \"part-00001-90312b96-b487-4a8f-9edc-1b9b3963f136-c000.snappy.parquet\",\n    \"partitionValues\": {},\n    \"size\": 858,\n    \"modificationTime\": 1705070631953,\n    \"dataChange\": true,\n    \"stats\": \"{\\\"numRecords\\\":2,\\\"minValues\\\":{\\\"num\\\":2,\\\"animal\\\":\\\"dog\\\"},\\\"maxValues\\\":{\\\"num\\\":3,\\\"animal\\\":\\\"snake\\\"},\\\"nullCount\\\":{\\\"num\\\":0,\\\"animal\\\":0}}\",\n    \"tags\": null,\n    \"deletionVector\": null,\n    \"baseRowId\": null,\n    \"defaultRowCommitVersion\": null,\n    \"clusteringProvider\": null\n  }\n}\n{\n  \"remove\": {\n    \"path\": \"0-fea2de92-861a-423e-9708-a9e91dafb27b-0.parquet\",\n    \"dataChange\": true,\n    \"deletionTimestamp\": 1705070631953,\n    \"extendedFileMetadata\": true,\n    \"partitionValues\": {},\n    \"size\": 895\n  }\n}\n{\n  \"commitInfo\": {\n    \"timestamp\": 1705070631953,\n    \"operation\": \"DELETE\",\n    \"operationParameters\": {\n      \"predicate\": \"animal = 'cat'\"\n    },\n    \"readVersion\": 0,\n    \"operationMetrics\": {\n      \"execution_time_ms\": 8013,\n      \"num_added_files\": 1,\n      \"num_copied_rows\": 2,\n      \"num_deleted_rows\": 1,\n      \"num_removed_files\": 1,\n      \"rewrite_time_ms\": 2,\n      \"scan_time_ms\": 5601\n    },\n    \"clientVersion\": \"delta-rs.0.17.0\"\n  }\n}\n

    We can see that this transaction includes two components:

    • Remove file 0-fea2de92-861a-423e-9708-a9e91dafb27b-0.parquet
    • Add file part-00001-90312b96-b487-4a8f-9edc-1b9b3963f136-c000.snappy.parquet

    Transactions are recorded in the transaction log. The transaction log is also referred to as the table metadata and is the _delta_log directory in storage.

    Let\u2019s see how Delta Lake implements transactions.

    "},{"location":"how-delta-lake-works/delta-lake-acid-transactions/#how-delta-lake-implements-transactions","title":"How Delta Lake implements transactions","text":"

    Here is how Delta Lake implements transactions:

    1. Read the existing metadata
    2. Read the existing Parquet data files
    3. Write the Parquet files for the current transaction
    4. Record the new transaction in the transaction log (if there are no conflicts)

    Let\u2019s recall our delete operation from the prior section and see how it fits into this transaction model:

    1. We read the existing metadata to find the file paths for the existing Parquet files
    2. We read the existing Parquet files and identify the files that contains data that should be removed
    3. We write new Parquet files with the deleted data filtered out
    4. Once the new Parquet files are written, we check for conflicts and then make an entry in the transaction log. The next section will discuss transaction conflicts in more detail.

    Blind append operations can skip a few steps and are executed as follows:

    1. Write the Parquet files for the current transaction
    2. Record the new transaction in the metadata

Delta implements non-locking MVCC (multi-version concurrency control), so writers optimistically write new data and simply abandon the transaction if a conflict is detected at the end. The alternative would be to acquire a lock at the start, which would guarantee the transaction up front at the cost of blocking other writers.

    Let\u2019s look at the case when a Delta Lake transaction conflicts.

    "},{"location":"how-delta-lake-works/delta-lake-acid-transactions/#how-delta-lake-transactions-can-conflict","title":"How Delta Lake transactions can conflict","text":"

    Suppose you have a transaction that deletes a row of data that\u2019s stored in FileA (Transaction 1). While this job is running, there is another transaction that deletes some other rows in FileA (Transaction 2). Transaction 1 finishes running first and is recorded in the metadata.

Before Transaction 2 is recorded, it will check the metadata, find that it conflicts with a transaction that was already recorded (Transaction 1), and fail without recording a new transaction.

Transaction 2 will write Parquet data files, but it will not be recorded as a transaction, so those data files will be ignored. The zombie Parquet files can be easily cleaned up via subsequent vacuum operations.

Transaction 2 must fail; otherwise, it would cause the data to be incorrect.

    Delta Lake transactions prevent users from making changes that would corrupt the table. Transaction conflict behavior can differ based on isolation level, which controls the degree to which a transaction must be isolated from modifications made by other concurrent transactions. More about this in the concurrency section.

    "},{"location":"how-delta-lake-works/delta-lake-acid-transactions/#transactions-rely-on-atomic-primitives-storage-guarantees","title":"Transactions rely on atomic primitives storage guarantees","text":"

Suppose you have two transactions that are finishing at exactly the same time. Both of these transactions look at the existing Delta Lake transaction log, see that the latest transaction was 003.json and determine that the next entry should be 004.json.

    If both transactions are recorded in the 004.json file, then one of them will be clobbered, and the transaction log entry for the clobbered metadata entry will be lost.

    Delta tables rely on storage systems that provide atomic primitives for safe concurrency. The storage system must allow Delta Lake to write the file, only if it does not exist already, and error out otherwise. The storage system must NOT permit concurrent writers to overwrite existing metadata entries.

    Some clouds have filesystems that don\u2019t explicitly support these atomic primitives, and therefore must be coupled with other services to provide the necessary guarantees.

    "},{"location":"how-delta-lake-works/delta-lake-acid-transactions/#delta-lake-transactions-are-only-for-a-single-table","title":"Delta Lake transactions are only for a single table","text":"

    Delta Lake transactions are only valid for a single table.

    Some databases offer transaction support for operations that impact multiple tables. Delta Lake does not support multi-table transactions.

    "},{"location":"how-delta-lake-works/delta-lake-acid-transactions/#data-lakes-dont-support-transactions","title":"Data lakes don\u2019t support transactions","text":"

    Data lakes consist of many files in a storage system (e.g. a cloud storage system) and don\u2019t support transactions.

    Data lakes don\u2019t have a metadata layer, conflict resolution, or any way to store information about transactions.

    Data lakes are prone to multiple types of errors because they don\u2019t support transactions:

    • Easy to corrupt
    • Downtime/unstable state while jobs are running
    • Operations can conflict

Data lakes have many downsides, and it\u2019s almost always better to use a lakehouse storage system like Delta Lake than a plain data lake.

    "},{"location":"how-delta-lake-works/delta-lake-acid-transactions/#acid-transactions","title":"ACID Transactions","text":"

    We\u2019ve already explored how Delta Lake supports transactions. This section explains how Delta Lake transactions have the Atomic, Consistent, Isolated and Durable (ACID transaction) properties. Reading this section is optional.

    ACID transactions are commonplace in databases but notably absent for data lakes.

    Delta Lake\u2019s ACID transaction support is one of the major reasons it is almost always a better option than a data lake.

    Let\u2019s explore how Delta Lake allows for ACID transactions.

    Atomic transactions

    An atomic transaction either fully completes or fully fails, with nothing in between.

    Delta Lake transactions are atomic, unlike data lake transactions that are not atomic.

    Suppose you have a job that\u2019s writing 100 files to a table. Further suppose that the job errors out and the cluster dies after writing 40 files:

    • For a Delta table, no additional data will be added to the table. Parquet files were written to the table, but the job errored, so no transaction log entry was added and no data was added to the table.
    • For a data lake, the 40 files are added and the transaction \u201cpartially succeeds\u201d.

    For data tables, it\u2019s almost always preferable to have a transaction that \u201cfully fails\u201d instead of one that \u201cpartially succeeds\u201d because partial writes are hard to unwind and debug.

    Delta Lake implements atomic transactions by writing data files first before making a new entry in the Delta transaction log.

    These guarantees are provided at the protocol level through the \"transaction\" abstraction. We\u2019ve already discussed what constitutes a transaction for Delta Lake.

    If there is an error with the transaction and some files don\u2019t get written, then no metadata entry is made and the partial data write is ignored. The zombie Parquet files can be easily cleaned up via subsequent vacuum operations.

    Now let\u2019s look at how Delta Lake also provides consistent transactions.

    Consistent transactions

    Consistency means that transactions won\u2019t violate integrity constraints on the Delta table.

    Delta Lake has two types of consistency checks:

    • Schema enforcement checks
    • Column constraints

    Schema enforcement checks verify that new data appended to a Delta table matches the schema of the existing table. You cannot append data with a different schema, unless you enable schema evolution.

    Delta Lake column constraints allow users to specify the requirements of data that\u2019s added to a Delta table. For example, if you have an age column with a constraint that requires the value to be positive, then Delta Lake will reject appends of any data that doesn\u2019t meet the constraint.
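
    As a small sketch with the Python deltalake API, a constraint like the age example could be added to an existing table as follows (the table path and constraint name are illustrative):

    from deltalake import DeltaTable\n\ndt = DeltaTable(\"tmp/people_table\")  # placeholder path\n\n# Future appends that violate this constraint will error instead of\n# silently adding bad data to the table.\ndt.alter.add_constraint({\"age_is_positive\": \"age > 0\"})\n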

    Data lakes don\u2019t support schema enforcement or column constraints. That\u2019s another reason why data lakes are not ACID-compliant.

    Isolated transactions

    Isolation means that transactions are applied to a Delta table sequentially.

    Delta Lake transactions are persisted in monotonically increasing transaction files, as we saw in the previous example. First 00000000000000000000.json, then 00000000000000000001.json, then 00000000000000000002.json, and so on.

    Delta Lake uses concurrency control to ensure that transactions are executed sequentially, even when user operations are performed concurrently. The next page of this guide explains concurrency in Delta Lake in detail.
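
    You can see this sequence of committed transactions with the Python deltalake API (the table path is a placeholder):

    from deltalake import DeltaTable\n\ndt = DeltaTable(\"tmp/some_table\")  # placeholder path\n\nprint(dt.version())  # latest committed transaction number, e.g. 2\nprint(dt.history())  # one entry per commit\n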

    Durable transactions

    Delta tables are generally persisted in cloud object stores which provide durability guarantees.

    Durability means that all transactions that are successfully completed will always remain persisted, even if there are service outages or program crashes.

    Suppose you have a Delta table that\u2019s persisted in Azure blob storage. The Delta table transactions that are committed will always remain available, even in these circumstances:

    • When there are Azure service outages
    • If a computation cluster that\u2019s writing the Delta table crashes for some reason
    • When two operations run concurrently and one of them fails

    Successful transactions are always registered in the Delta table and persisted no matter what.

    "},{"location":"how-delta-lake-works/delta-lake-acid-transactions/#conclusion","title":"Conclusion","text":"

    Delta Lake supports transactions which provide necessary reliability guarantees for production data systems.

    Vanilla data lakes don\u2019t provide transactions and this can cause nasty bugs and a bad user experience. Let\u2019s look at a few scenarios where the lack of transactions causes a poor user experience:

    • While running a compaction operation on a data lake, newly compacted \u201cright sized\u201d files are added before the small files are deleted. If you read the data lake while this operation is running, you will see duplicate data.
    • While writing to a data lake, a job might fail, which leaves behind partially written files. These files are corrupt, which means that the data lake cannot be read until the corrupt files are manually removed.
    • Users want to run a simple DML operation, like deleting a few rows of data, which requires a few files to be rewritten. This operation renders the data lake unusable until it\u2019s done running.

    Transactions are a key advantage of Delta Lake vs. data lakes. There are many other advantages, but proper transactions are necessary in production data environments.

    "},{"location":"how-delta-lake-works/delta-lake-file-skipping/","title":"Delta Lake File Skipping","text":"

    Delta tables store file-level metadata information, which allows for a powerful optimization called file skipping.

    This page explains how Delta Lake implements file skipping, how to optimize your tables to maximize file skipping, and the benefits of file skipping.

    Let\u2019s start by looking at the file-level metadata in Delta tables.

    "},{"location":"how-delta-lake-works/delta-lake-file-skipping/#delta-lake-file-metadata","title":"Delta Lake file metadata","text":"

    Delta Lake stores metadata about each file's min/max values in the table. Query engines can skip entire files when they don\u2019t contain data that\u2019s relevant to the query.

    Suppose you have a Delta table with data stored in two files that have the following metadata.

    filename    min_name    max_name    min_age max_age\nfileA       alice       joy         12      46  \nfileB       allan       linda       34      78\n

    Suppose you want to run the following query: select * from the_table where age < 20.

    The engine only needs to read fileA to execute this query. fileB has a min_age of 34, so we know there aren\u2019t any rows of data with an age less than 20.

    The benefit of file skipping depends on the query and the data layout of the Delta table. Some queries cannot take advantage of any file skipping. Here\u2019s an example query that does not benefit from file skipping: select * from the_table group by age.

    Let\u2019s recreate this example with Polars to drive the point home.

    Start by writing out one file of data:

    import polars as pl\nfrom deltalake import DeltaTable\n\ndf = pl.DataFrame({\"name\": [\"alice\", \"cat\", \"joy\"], \"age\": [12, 35, 46]})\ndf.write_delta(\"tmp/a_table\")\n

    Now, write out another file of data:

    df = pl.DataFrame({\"name\": [\"allan\", \"brian\", \"linda\"], \"age\": [34, 35, 78]})\ndf.write_delta(\"tmp/a_table\", mode=\"append\")\n

    Here are the contents of the Delta table:

    tmp/a_table\n\u251c\u2500\u2500 0-7d414a88-a634-4c2f-9c5b-c29b6ee5f524-0.parquet\n\u251c\u2500\u2500 1-0617ef60-b17b-46a5-9b0f-c7dda1b73eee-0.parquet\n\u2514\u2500\u2500 _delta_log\n    \u251c\u2500\u2500 00000000000000000000.json\n    \u2514\u2500\u2500 00000000000000000001.json\n

    Now run a query to fetch all the records where the age is less than 20:

    pl.scan_delta(\"tmp/a_table\").filter(pl.col(\"age\") < 20).collect()\n
    +-------+-----+\n| name  | age |\n| ---   | --- |\n| str   | i64 |\n+=============+\n| alice | 12  |\n+-------+-----+\n

    Polars can use the Delta table metadata to skip the file that does not contain data relevant to the query.

    "},{"location":"how-delta-lake-works/delta-lake-file-skipping/#how-delta-lake-implements-file-skipping","title":"How Delta Lake implements file skipping","text":"

    Here\u2019s how engines execute queries on Delta tables:

    • Start by reading the transaction log to get the file paths, file sizes, and min/max value for each column
    • Parse the query and push down the predicates to skip files
    • Read the minimal subset of the files needed for the query

    Some file formats don\u2019t allow for file skipping. For example, CSV files don\u2019t have file-level metadata, so query engines can\u2019t read a minimal subset of the data. The query engine has to check all the files, even if they don\u2019t contain any relevant data.

    When data is in Parquet files, the query engine can open up all the files, read the footers, build the file-level metadata, and perform file skipping. Fetching metadata from each file is slower than grabbing the pre-built file-level metadata from the transaction log.
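
    You can inspect the pre-built file-level metadata yourself with the Python deltalake API. A minimal sketch, reusing the tmp/a_table table created above:

    from deltalake import DeltaTable\n\ndt = DeltaTable(\"tmp/a_table\")\n\n# One row per data file, including per-column min/max statistics\n# taken straight from the transaction log.\nprint(dt.get_add_actions(flatten=True).to_pandas())\n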

    Now, let\u2019s see how to structure your tables to allow for more file skipping.

    "},{"location":"how-delta-lake-works/delta-lake-file-skipping/#file-skipping-for-different-file-sizes","title":"File skipping for different file sizes","text":"

    Delta tables store data in files. Smaller files allow for more file skipping compared to bigger files.

    However, an excessive number of small files isn\u2019t good because it creates I/O overhead and slows down queries.

    Your Delta tables should have files that are \u201cright-sized\u201d. For a table with 150 GB of data, 5 GB files would probably be too large, and 10 KB files would be too small. It\u2019s generally best to store data in files that are between 100 MB and 1 GB.

    Delta Lake has an optimize function that performs small file compaction, so you don\u2019t need to program this logic yourself.
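
    Here is a minimal sketch of running small file compaction with the Python deltalake API (the table path is a placeholder):

    from deltalake import DeltaTable\n\ndt = DeltaTable(\"tmp/a_table\")  # placeholder path\n\n# Rewrite many small files into fewer, larger files and report what changed\nprint(dt.optimize.compact())\n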

    Now, let's investigate how to store data in files to maximize the file skipping opportunities.

    "},{"location":"how-delta-lake-works/delta-lake-file-skipping/#how-to-maximize-file-skipping","title":"How to maximize file skipping","text":"

    You can maximize file-skipping by colocating similar data in the same files.

    Suppose you have a table with test scores and frequently run queries that filter based on the test_score column.

    filename    min_test_score  max_test_score\nfileA       45              100\nfileB       65              98\nfileC       50              96\n

    Suppose you want to run the following query: select * from exams where test_score > 90.

    This query cannot skip files, given the current organization of the data. You can rearrange the data to colocate similar test scores in the same files to allow for file skipping. Here\u2019s the new layout:

    filename    min_test_score  max_test_score\nfileD       45              70\nfileE       55              80\nfileF       78              100\n

    The query (select * from exams where test_score > 90) can skip two of the three files with the new Delta table layout. The query engine only has to read fileF for this query.
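
    One built-in way to colocate similar data like this is Z Ordering on the column you filter on most often. A minimal sketch with the Python deltalake API (the table path is a placeholder):

    from deltalake import DeltaTable\n\ndt = DeltaTable(\"tmp/exams\")  # placeholder path\n\n# Rewrite the files so that rows with similar test_score values\n# end up colocated in the same files.\nprint(dt.optimize.z_order([\"test_score\"]))\n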

    Now, let\u2019s look at how file skipping works with string values.

    "},{"location":"how-delta-lake-works/delta-lake-file-skipping/#how-file-skipping-works-with-strings","title":"How file skipping works with strings","text":"

    File skipping is also effective when filtering on string values.

    Suppose you have a table with person_name and country columns. There are millions of rows of data. Here are the first three rows of data:

    person_name country\nperson1     angola\nperson2     china\nperson3     mexico\n

    The Delta table contains three files with the following metadata:

    filename    min_country max_country\nfileA       albania     mali\nfileB       libia       paraguay\nfileC       oman        zimbabwe\n

    Suppose you want to run the following query: select * from some_people where country = 'austria'.

    You only need to read the data in fileA to run this query. The min_country values for fileB and fileC are greater than \u201caustria\u201d, so we know those files don\u2019t contain any data relevant to the query.

    File skipping can also be a robust optimization for string values. Now, let\u2019s see how file skipping works for partitioned tables.

    "},{"location":"how-delta-lake-works/delta-lake-file-skipping/#file-skipping-for-partitioned-tables","title":"File skipping for partitioned tables","text":"

    You can partition Delta tables for file skipping as well. Suppose we have the same data as in the previous section, but the table is partitioned by country.

    Here\u2019s the Delta table:

    filename    partition\nfileA       albania\nfileB       libia\nfileC       oman\nfileD       jamaica\nfileE       albania\nfileF       oman\n

    Suppose you want to run the following query on this partitioned table: select * from some_partitioned_table where country = 'albania'.

    You only need to read fileA and fileE to execute this query. Delta Lake provides the file-level partition metadata in the transaction log so that this query will run quickly.
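
    As a small sketch, here is how such a country-partitioned table could be created with the Python deltalake writer (the data and path are illustrative):

    import pandas as pd\nfrom deltalake import write_deltalake\n\ndf = pd.DataFrame(\n    {\n        \"person_name\": [\"person1\", \"person2\", \"person3\"],\n        \"country\": [\"albania\", \"libia\", \"oman\"],\n    }\n)\n\n# Each distinct country value gets its own directory of files, and the\n# partition values are recorded in the transaction log for file skipping.\nwrite_deltalake(\"tmp/some_partitioned_table\", df, partition_by=[\"country\"])\n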

    "},{"location":"how-delta-lake-works/delta-lake-file-skipping/#conclusion","title":"Conclusion","text":"

    Delta Lake allows for file skipping, which is a powerful performance optimization.

    Delta Lake also provides built-in utilities, like partitioning, Z Ordering, and compaction, to colocate data in the same files and improve file skipping.

    Delta Lake users need to know how to assess the tradeoffs of these techniques to optimize file skipping. Users also need to understand the most frequent query patterns of their tables to best allow for maximal file skipping.

    "},{"location":"integrations/delta-lake-arrow/","title":"Delta Lake Arrow Integrations","text":"

    Delta Lake tables can be exposed as Arrow tables and Arrow datasets, which allows for interoperability with a variety of query engines.

    This page shows you how to convert Delta tables to Arrow data structures and teaches you the difference between Arrow tables and Arrow datasets. Tables are \"eager\" and datasets are \"lazy\", which has important performance implications. Keep reading to learn more!

    "},{"location":"integrations/delta-lake-arrow/#delta-lake-to-arrow-dataset","title":"Delta Lake to Arrow Dataset","text":"

    Delta tables can easily be exposed as Arrow datasets. This makes it easy for any query engine that can read Arrow datasets to read a Delta table.

    Let's take a look at the h2o groupby dataset that contains 9 columns of data. Here are three representative rows of data:

    +-------+-------+--------------+-------+-------+--------+------+------+---------+\n| id1   | id2   | id3          |   id4 |   id5 |    id6 |   v1 |   v2 |      v3 |\n|-------+-------+--------------+-------+-------+--------+------+------+---------|\n| id016 | id046 | id0000109363 |    88 |    13 | 146094 |    4 |    6 | 18.8377 |\n| id039 | id087 | id0000466766 |    14 |    30 | 111330 |    4 |   14 | 46.7973 |\n| id047 | id098 | id0000307804 |    85 |    23 | 187639 |    3 |    5 | 47.5773 |\n+-------+-------+--------------+-------+-------+--------+------+------+---------+\n

    Here's how to expose the Delta table as a PyArrow dataset and run a query with DuckDB:

    import duckdb\nfrom deltalake import DeltaTable\n\ntable = DeltaTable(\"delta/G1_1e9_1e2_0_0\")\ndataset = table.to_pyarrow_dataset()\nquack = duckdb.arrow(dataset)\nquack.filter(\"id1 = 'id016' and v2 > 10\")\n

    Here's the result:

    \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n\u2502   id1   \u2502   id2   \u2502     id3      \u2502  id4  \u2502  id5  \u2502   id6   \u2502  v1   \u2502  v2   \u2502    v3     \u2502\n\u2502 varchar \u2502 varchar \u2502   varchar    \u2502 int32 \u2502 int32 \u2502  int32  \u2502 int32 \u2502 int32 \u2502  double   \u2502\n\u251c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2524\n\u2502 id016   \u2502 id054   \u2502 id0002309114 \u2502    62 \u2502    95 \u2502 7180859 \u2502     4 \u2502    13 \u2502  7.750173 \u2502\n\u2502 id016   \u2502 id044   \u2502 id0003968533 \u2502    63 \u2502    98 \u2502 2356363 \u2502     4 \u2502    14 \u2502  3.942417 \u2502\n\u2502 id016   \u2502 id034   \u2502 id0001082839 \u2502    58 \u2502    73 \u2502 8039808 \u2502     5 \u2502    12 \u2502 76.820135 \u2502\n\u251c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2524\n\u2502 ? rows (>9999 rows, 3 shown)                                                 9 columns \u2502\n\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n

    Arrow datasets allow for the predicates to get pushed down to the query engine, so the query is executed quickly.

    "},{"location":"integrations/delta-lake-arrow/#delta-lake-to-arrow-table","title":"Delta Lake to Arrow Table","text":"

    You can also run the same query with DuckDB on an Arrow table:

    quack = duckdb.arrow(table.to_pyarrow_table())\nquack.filter(\"id1 = 'id016' and v2 > 10\")\n

    This returns the same result, but it runs slower.

    "},{"location":"integrations/delta-lake-arrow/#difference-between-arrow-dataset-and-arrow-table","title":"Difference between Arrow Dataset and Arrow Table","text":"

    Arrow Datasets are lazy and allow for full predicate pushdown, unlike Arrow tables, which are eagerly loaded into memory.

    The previous DuckDB queries were run on a 1 billion row dataset that's roughly 50 GB when stored as an uncompressed CSV file. Here are the runtimes when the data is stored in a Delta table and the queries are executed on a 2021 Macbook M1 with 64 GB of RAM:

    • Arrow table: 17.1 seconds
    • Arrow dataset: 0.01 seconds

    The query runs much faster on an Arrow dataset because the predicates can be pushed down to the query engine and lots of data can be skipped.

    Arrow tables are eagerly materialized in memory and don't allow for the same amount of data skipping.

    "},{"location":"integrations/delta-lake-arrow/#multiple-query-engines-can-query-arrow-datasets","title":"Multiple query engines can query Arrow Datasets","text":"

    Other query engines like DataFusion can also query Arrow datasets, see the following example:

    from datafusion import SessionContext\n\nctx = SessionContext()\nctx.register_dataset(\"my_dataset\", table.to_pyarrow_dataset())\nctx.sql(\"select * from my_dataset where v2 > 5\")\n

    Here's the result:

    +-------+-------+--------------+-----+-----+--------+----+----+-----------+\n| id1   | id2   | id3          | id4 | id5 | id6    | v1 | v2 | v3        |\n+-------+-------+--------------+-----+-----+--------+----+----+-----------+\n| id082 | id049 | id0000022715 | 97  | 55  | 756924 | 2  | 11 | 74.161136 |\n| id053 | id052 | id0000113549 | 19  | 56  | 139048 | 1  | 10 | 95.178444 |\n| id090 | id043 | id0000637409 | 94  | 50  | 12448  | 3  | 12 | 60.21896  |\n+-------+-------+--------------+-----+-----+--------+----+----+-----------+\n

    Any query engine that's capable of reading an Arrow table/dataset can read a Delta table.

    "},{"location":"integrations/delta-lake-arrow/#conclusion","title":"Conclusion","text":"

    Delta tables can easily be exposed as Arrow tables/datasets.

    Therefore any query engine that can read an Arrow table/dataset can also read a Delta table.

    Arrow datasets allow for more predicates to be pushed down to the query engine, so they can deliver better performance than Arrow tables.

    "},{"location":"integrations/delta-lake-daft/","title":"Using Delta Lake with Daft","text":"

    Daft is a framework for ETL, analytics, and ML/AI at scale with a familiar Python dataframe API, implemented in Rust.

    For Delta Lake users, Daft is a great data processing tool because it gives you the following features:

    • Skipping Filtered Data: Daft implements automatic partition pruning and stats-based file pruning for filter predicates, skipping data that doesn\u2019t need to be read.
    • Multimodal Dataframes: read, write and transform multimodal data incl. images, JSON, PDF, audio, etc.
    • Parallel + Distributed Reads: Daft parallelizes Delta Lake table reads over all cores of your machine, if using the default multithreading runner, or all cores + machines of your Ray cluster, if using the distributed Ray runner.
    • Multi-cloud Support: Daft supports reading Delta Lake tables from AWS S3, Azure Blob Store, and GCS, as well as local files.

    Daft and Delta Lake work really well together. Daft provides unified compute for Delta Lake\u2019s unified storage. Together, Delta Lake and Daft give you high-performance query optimization and distributed compute on massive datasets.

    "},{"location":"integrations/delta-lake-daft/#installing-daft-for-delta-lake","title":"Installing Daft for Delta Lake","text":"

    The easiest way to use Delta Lake format with Daft DataFrames is to install Daft with the [deltalake] extras using pip:

    !pip install -U \"getdaft[deltalake]\"\n

    This adds the deltalake Python package to your install. This package is used to fetch metadata about the Delta Lake table, such as paths to the underlying Parquet files and table statistics. You can of course also install the deltalake package manually.

    "},{"location":"integrations/delta-lake-daft/#read-delta-lake-into-a-daft-dataframe","title":"Read Delta Lake into a Daft DataFrame","text":"

    You can easily read Delta Lake tables into a Daft DataFrame using the read_delta_lake method. Let's use it to read some data stored in a Delta Lake on disk. You can access the data stored as a Delta Lake on Github.

    import daft\n\n# read delta table into Daft DataFrame\ndf = daft.read_delta_lake(\"path/to/delta_table\")\n

    You can also read in Delta Lake data from remote sources like S3:

    # table_uri = (\n#     \"s3://daft-public-datasets/red-pajamas/\"\n#     \"stackexchange-sample-north-germanic-deltalake\"\n# )\n# df = daft.read_delta_lake(table_uri)\n
    df\n
    +------------+-----------+---------+-----------+\n| first_name | last_name | country | continent |\n| Utf8       | Utf8      | Utf8    | Utf8      |\n+------------+-----------+---------+-----------+\n(No data to display: Dataframe not materialized)

    Daft DataFrames are lazy by default. This means that the contents will not be computed (\"materialized\") unless you explicitly tell Daft to do so. This is best practice for working with larger-than-memory datasets and parallel/distributed architectures.

    The Delta table we have just loaded only has 5 rows. You can materialize it in memory using the .collect method:

    > df.collect()\n\n|    | first_name   | last_name   | country   | continent   |\n|---:|:-------------|:------------|:----------|:------------|\n|  0 | Ernesto      | Guevara     | Argentina | NaN         |\n|  1 | Bruce        | Lee         | China     | Asia        |\n|  2 | Jack         | Ma          | China     | Asia        |\n|  3 | Wolfgang     | Manche      | Germany   | NaN         |\n|  4 | Soraya       | Jala        | Germany   | NaN         |\n
    "},{"location":"integrations/delta-lake-daft/#what-can-i-do-with-a-daft-dataframe","title":"What can I do with a Daft DataFrame?","text":"

    Daft gives you full-featured DataFrame functionality, similar to what you might be used to from pandas, Dask or PySpark.

    On top of this, Daft also gives you:

    • Multimodal data type support to work with Images, URLs, Tensors and more
    • Expressions API for easy column transformations
    • UDFs for multi-column transformation, incl. ML applications

    Let's take a quick look at some of Daft's basic DataFrame operations.

    You can select columns from your DataFrame using the select method. We'll use the show method to show the first n rows (defaults to 10):

    > df.select(\"first_name\", \"country\").show()\n\n|    | first_name   | country   |\n|---:|:-------------|:----------|\n|  0 | Ernesto      | Argentina |\n|  1 | Bruce        | China     |\n|  2 | Jack         | China     |\n|  3 | Wolfgang     | Germany   |\n|  4 | Soraya       | Germany   |\n

    You can sort your Daft DataFrame using the sort method:

    > df.sort(df[\"country\"], desc=True).show()\n\n|    | first_name   | last_name   | country   | continent   |\n|---:|:-------------|:------------|:----------|:------------|\n|  0 | Wolfgang     | Manche      | Germany   | NaN         |\n|  1 | Soraya       | Jala        | Germany   | NaN         |\n|  2 | Bruce        | Lee         | China     | Asia        |\n|  3 | Jack         | Ma          | China     | Asia        |\n|  4 | Ernesto      | Guevara     | Argentina | NaN         |\n

    You can filter your DataFrame using the where method:

    > df.where(df[\"continent\"] == \"Asia\").show()\n\n|    | first_name   | last_name   | country   | continent   |\n|---:|:-------------|:------------|:----------|:------------|\n|  0 | Bruce        | Lee         | China     | Asia        |\n|  1 | Jack         | Ma          | China     | Asia        |\n

    You can group your DataFrame by a specific column using the groupby method. You can then specify the aggregation method, in this case using the count aggregator:

    > df.select(\"first_name\", \"country\").groupby(df[\"country\"]).count(\"first_name\").show()\n\n|    | country   |   first_name |\n|---:|:----------|-------------:|\n|  0 | Germany   |            2 |\n|  1 | China     |            2 |\n|  2 | Argentina |            1 |\n

    Check out the Daft User Guide for a complete list of DataFrame operations.

    "},{"location":"integrations/delta-lake-daft/#data-skipping-optimizations","title":"Data Skipping Optimizations","text":"

    You may have noticed the Delta Lake warning at the top when we first called collect on our DataFrame:

    WARNING: has partitioning keys = [PartitionField(country#Utf8)], but no partition filter was specified. This will result in a full table scan.

    Delta Lake is informing us that the data is partitioned on the country column.

    Daft's native query optimizer has access to all of the Delta Lake metadata.

    This means it can optimize your query by skipping the partitions that are not relevant for this query. Instead of having to read all 3 partitions, we can read only 1 and get the same result, just faster!

    # Filter on partition columns will result in efficient partition pruning; non-matching partitions will be skipped.\n> df.where(df[\"country\"] == \"Germany\").show()\n\n|    | first_name   | last_name   | country   |   continent |\n|---:|:-------------|:------------|:----------|------------:|\n|  0 | Wolfgang     | Manche      | Germany   |         nan |\n|  1 | Soraya       | Jala        | Germany   |         nan |\n

    You can use the explain method to see how Daft is optimizing your query. Since we've already called collect on our DataFrame, it is already in memory. So below we copy the output of explain(show_all=True) before calling collect:

    Running df.where(df[\"continent\"] == \"Asia\").explain(True) returns:

    (...)\n\n== Optimized Logical Plan ==\n\n* PythonScanOperator: DeltaLakeScanOperator(None)\n|   File schema = first_name#Utf8, last_name#Utf8, country#Utf8, continent#Utf8\n|   Partitioning keys = [PartitionField(country#Utf8)]\n|   Filter pushdown = col(continent) == lit(\"Asia\")\n|   Output schema = first_name#Utf8, last_name#Utf8, country#Utf8, continent#Utf8\n\n\n== Physical Plan ==\n\n* TabularScan:\n|   Num Scan Tasks = 3\n|   Estimated Scan Bytes = 3045\n|   Clustering spec = { Num partitions = 3 }\n

    Whereas running df.where(df[\"country\"] == \"Germany\").explain(True) returns:

    (...)\n\n== Optimized Logical Plan ==\n\n* PythonScanOperator: DeltaLakeScanOperator(None)\n|   File schema = first_name#Utf8, last_name#Utf8, country#Utf8, continent#Utf8\n|   Partitioning keys = [PartitionField(country#Utf8)]\n|   Partition Filter = col(country) == lit(\"Germany\")\n|   Output schema = first_name#Utf8, last_name#Utf8, country#Utf8, continent#Utf8\n\n\n== Physical Plan ==\n\n* TabularScan:\n|   Num Scan Tasks = 1\n|   Estimated Scan Bytes = 1025\n|   Clustering spec = { Num partitions = 1 }\n

    Running a query on a non-partitioned column like continent will require reading in all partitions, totalling 3045 bytes in this case.

    Instead, running a query on a partitioned column (country in this case) means Daft only has to read the relevant partition, saving us a whopping 2000+ bytes in this toy example :)

    You can read High-Performance Querying on Massive Delta Lake Tables with Daft for an in-depth benchmarking of query optimization with Delta Lake and Daft.

    "},{"location":"integrations/delta-lake-daft/#transform-columns-with-expressions","title":"Transform columns with Expressions","text":"

    Daft provides a flexible Expressions API for defining computation that needs to happen over your columns.

    For example, we can use daft.col() expressions together with the with_column method to create a new column full_name, joining the contents of the last_name column to the first_name column:

    > df_full = df.with_column(\"full_name\", daft.col('first_name') + ' ' + daft.col('last_name'))\n> df_full.show()\n\n|    | first_name   | last_name   | country   | continent   | full_name       |\n|---:|:-------------|:------------|:----------|:------------|:----------------|\n|  0 | Ernesto      | Guevara     | Argentina | NaN         | Ernesto Guevara |\n|  1 | Bruce        | Lee         | China     | Asia        | Bruce Lee       |\n|  2 | Jack         | Ma          | China     | Asia        | Jack Ma         |\n|  3 | Wolfgang     | Manche      | Germany   | NaN         | Wolfgang Manche |\n|  4 | Soraya       | Jala        | Germany   | NaN         | Soraya Jala     |\n
    "},{"location":"integrations/delta-lake-daft/#multimodal-data-type-support","title":"Multimodal Data Type Support","text":"

    Daft has a rich multimodal type-system with support for Python objects, Images, URLs, Tensors and more.

    Daft columns can contain any Python objects. For example, let's add a column containing a Python class Dog for some of the people in our dataset:

    > import numpy as np\n\n> class Dog:\n>     def __init__(self, name):\n>         self.name = name\n\n>     def bark(self):\n>         return f\"{self.name}!\"\n\n> df_dogs = daft.from_pydict({\n>     'full_name': ['Ernesto Guevara','Bruce Lee','Jack Ma','Wolfgang Manche','Soraya Jala'],\n>     \"dogs\": [Dog(\"ruffles\"), Dog(\"shnoodles\"), Dog(\"waffles\"), Dog(\"doofus\"), Dog(\"Fluffles\")],\n> })\n\n> df_dogs.show()\n\n|    | full_name       | dogs                                 |\n|---:|:----------------|:-------------------------------------|\n|  0 | Ernesto Guevara | <__main__.Dog object at 0x1603d1c10> |\n|  1 | Bruce Lee       | <__main__.Dog object at 0x126ab9b90> |\n|  2 | Jack Ma         | <__main__.Dog object at 0x1603d27d0> |\n|  3 | Wolfgang Manche | <__main__.Dog object at 0x1603d1cd0> |\n|  4 | Soraya Jala     | <__main__.Dog object at 0x1603d3f50> |\n

    You can join this new dogs column to your existing DataFrame using the join method:

    > df_family = df_full.join(df_dogs, on=[\"full_name\"])\n> df_family.show()\n\n|    | full_name       | first_name   | last_name   | country   | continent   | dogs                                 |\n|---:|:----------------|:-------------|:------------|:----------|:------------|:-------------------------------------|\n|  0 | Ernesto Guevara | Ernesto      | Guevara     | Argentina | NaN         | <__main__.Dog object at 0x1603d1c10> |\n|  1 | Bruce Lee       | Bruce        | Lee         | China     | Asia        | <__main__.Dog object at 0x126ab9b90> |\n|  2 | Jack Ma         | Jack         | Ma          | China     | Asia        | <__main__.Dog object at 0x1603d27d0> |\n|  3 | Wolfgang Manche | Wolfgang     | Manche      | Germany   | NaN         | <__main__.Dog object at 0x1603d1cd0> |\n|  4 | Soraya Jala     | Soraya       | Jala        | Germany   | NaN         | <__main__.Dog object at 0x1603d3f50> |\n

    We can then use the apply method to apply a function to each instance of the Dog class:

    > from daft import DataType\n\n> df_family = df_family.with_column(\n>     \"dogs_bark_name\",\n>     df_family[\"dogs\"].apply(lambda dog: dog.bark(), return_dtype=DataType.string()),\n> )\n\n> df_family.show()\n\n|    | first_name   | last_name   | country   | continent   | full_name       | dogs                                 | dogs_bark_name   |\n|---:|:-------------|:------------|:----------|:------------|:----------------|:-------------------------------------|:-----------------|\n|  0 | Ernesto      | Guevara     | Argentina | NaN         | Ernesto Guevara | <__main__.Dog object at 0x1603d1c10> | ruffles!         |\n|  1 | Bruce        | Lee         | China     | Asia        | Bruce Lee       | <__main__.Dog object at 0x126ab9b90> | shnoodles!       |\n|  2 | Jack         | Ma          | China     | Asia        | Jack Ma         | <__main__.Dog object at 0x1603d27d0> | waffles!         |\n|  3 | Wolfgang     | Manche      | Germany   | NaN         | Wolfgang Manche | <__main__.Dog object at 0x1603d1cd0> | doofus!          |\n|  4 | Soraya       | Jala        | Germany   | NaN         | Soraya Jala     | <__main__.Dog object at 0x1603d3f50> | Fluffles!        |\n

    Daft DataFrames can also contain many other data types, like tensors, JSON, URLs and images. The Expressions API provides useful tools to work with these data types.

    Take a look at the notebook in the delta-examples Github repository for a closer look at how Daft handles URLs, images and ML applications.

    "},{"location":"integrations/delta-lake-daft/#transform-multiple-columns-with-udfs","title":"Transform multiple columns with UDFs","text":"

    You can use User-Defined Functions (UDFs) to run functions over multiple rows or columns:

    > from daft import udf\n\n> @udf(return_dtype=DataType.string())\n> def custom_bark(dog_series, owner_series):\n>     return [\n>         f\"{dog.name} loves {owner_name}!\"\n>         for dog, owner_name\n>         in zip(dog_series.to_pylist(), owner_series.to_pylist())\n>     ]\n\n> df_family = df_family.with_column(\"custom_bark\", custom_bark(df_family[\"dogs\"], df_family[\"first_name\"]))\n> df_family.select(\"full_name\", \"dogs_bark_name\", \"custom_bark\").show()\n\n|    | full_name       | dogs_bark_name   | custom_bark            |\n|---:|:----------------|:-----------------|:-----------------------|\n|  0 | Ernesto Guevara | ruffles!         | ruffles loves Ernesto! |\n|  1 | Bruce Lee       | shnoodles!       | shnoodles loves Bruce! |\n|  2 | Jack Ma         | waffles!         | waffles loves Jack!    |\n|  3 | Wolfgang Manche | doofus!          | doofus loves Wolfgang! |\n|  4 | Soraya Jala     | Fluffles!        | Fluffles loves Soraya! |\n

    Daft supports workloads with many more data types than traditional DataFrame APIs.

    By combining multimodal data support with the UDF functionality you can run ML workloads right within your DataFrame.

    "},{"location":"integrations/delta-lake-daft/#when-should-i-use-daft-dataframes","title":"When should I use Daft DataFrames?","text":"

    Daft DataFrames are designed for multimodal, distributed workloads.

    You may want to consider using Daft if you're working with:

    1. Large datasets that don't fit into memory or would benefit from parallelization
    2. Multimodal data types, such as images, JSON, vector embeddings, and tensors
    3. ML workloads that would benefit from interactive computation within DataFrame (via UDFs)

    Take a look at the Daft tutorials for in-depth examples of each use case.

    "},{"location":"integrations/delta-lake-daft/#contribute-to-daft","title":"Contribute to daft","text":"

    Excited about Daft and want to contribute? Join them on Github \ud83d\ude80

    Like many technologies, Daft collects some non-identifiable telemetry to improve the product. This is strictly non-identifiable metadata. You can disable telemetry by setting the following environment variable: DAFT_ANALYTICS_ENABLED=0. Read more in the Daft documentation.

    "},{"location":"integrations/delta-lake-dagster/","title":"Using Delta Lake with Dagster\u00b6","text":"

    Delta Lake is a great storage format for Dagster workflows. This page will explain why and how to use Delta Lake with Dagster.

    You will learn how to use the Delta Lake I/O Manager to read and write your Dagster Software-Defined Assets (SDAs). You will also learn about the unique advantages Delta Lake offers the Dagster community.

    Here are some of the benefits that Delta Lake provides Dagster users:

    • Native PyArrow integration for lazy computation of large datasets
    • More efficient querying with file skipping via Z Ordering and liquid clustering
    • Built-in vacuuming to remove unnecessary files and versions
    • ACID transactions for reliable writes
    • Smooth versioning integration, so that versions can be used to trigger downstream updates
    • Surfacing table stats based on the file statistics

    "},{"location":"integrations/delta-lake-dagster/#dagster-io-managers","title":"Dagster I/O Managers","text":"

    Dagster uses I/O Managers to simplify data reads and writes. I/O Managers help you reduce boilerplate code by storing Dagster Asset and Op outputs and loading them as inputs to downstream objects. They make it easy to change where and how your data is stored.

    You only need to define your I/O Manager and its settings (such as storage location and schema) once and the I/O Manager will take care of correctly reading and writing all your Dagster Assets automatically.

    If you need lower-level access than the Dagster I/O Managers provide, take a look at the Delta Table Resource.

    "},{"location":"integrations/delta-lake-dagster/#the-delta-lake-io-manager","title":"The Delta Lake I/O Manager","text":"

    You can easily read and write Delta Lake Tables from Dagster by using the DeltaLakeIOManager().

    Install the DeltaLakeIOManager:

    pip install dagster-deltalake\n

    Next, configure the following settings in your project\u2019s __init__.py file:

    • io_manager: set this to DeltaLakeIOManager(); this sets the default I/O Manager for all your Assets

    Within the DeltaLakeIOManager, define:

    • root_uri: the root path where your Delta Tables will be created
    • storage_options: configuration for accessing storage location
    • schema: name of schema to use (optional, defaults to public)

    defs = Definitions(\n   assets=all_assets,\n   resources={\n        \"io_manager\": DeltaLakePyarrowIOManager(\n            root_uri=\"path/to/deltalake\",\n            storage_options=LocalConfig(),\n            schema=\"dagster_deltalake\",\n        ),\n   },\n)\n

    Now, when you materialize an Asset, it will be saved as a Delta Lake in a folder dagster_deltalake/asset_name under the root directory path/to/deltalake.

    The default Delta Lake I/O Manager supports Arrow reads and writes. You can also use the Delta Lake I/O Manager with pandas or polars.

    "},{"location":"integrations/delta-lake-dagster/#creating-delta-lake-tables-with-dagster","title":"Creating Delta Lake Tables with Dagster","text":"

    You don\u2019t need to do anything else to store your Dagster Assets as Delta Lake tables. The I/O Manager will handle storing and loading your Assets as Delta Lake tables from now on.

    You can proceed to write Dagster code as you normally would. For example, you can create an Asset that reads in some toy data about animals and writes it out to an Arrow Table:

    import pyarrow as pa\nfrom pyarrow import csv\n\nfrom dagster import asset\n\n@asset\ndef raw_dataset() -> pa.Table:\n   n_legs = pa.array([2, 4, None, 100])\n   animals = pa.array([\"Flamingo\", \"Horse\", \"Brittle stars\", \"Centipede\"])\n   data = {'n_legs': n_legs, 'animals': animals}\n\n   return pa.Table.from_pydict(data)\n

    When you materialize the Asset defined above (using the config settings defined earlier), the Delta Lake I/O Manager will create the table dagster_deltalake/raw_dataset if it doesn\u2019t exist yet.

    "},{"location":"integrations/delta-lake-dagster/#overwrites-when-rematerializing-assets","title":"Overwrites when Rematerializing Assets","text":"

    If the table does already exist at the specified location, the Delta Lake I/O Manager will perform an overwrite. Delta Lake\u2019s transaction log maintains a record of all changes to your Delta Lake tables. You can inspect the record of changes to your Delta Lake tables by taking a look at these transaction logs.

    "},{"location":"integrations/delta-lake-dagster/#loading-delta-lake-tables-in-downstream-assets","title":"Loading Delta Lake Tables in Downstream Assets","text":"

    You can use Assets stored as Delta Lake tables as input to downstream Assets. Dagster and the Delta Lake I/O Manager make this easy for you.

    You can write Dagster code as you normally would. Pass the upstream Asset as an argument to the downstream object to set up the dependency. Make sure to define the correct data type.

    The Delta Lake I/O Manager will handle reading and writing the data from your Delta Lake.

    import pyarrow as pa\nfrom dagster import asset\n\n# ... raw_dataset asset is defined here ...\n\n@asset\ndef clean_dataset(raw_dataset: pa.Table) -> pa.Table:\n   return raw_dataset.drop_null()\n
    "},{"location":"integrations/delta-lake-dagster/#reading-existing-delta-lake-tables-into-dagster","title":"Reading Existing Delta Lake Tables into Dagster","text":"

    You can make existing Delta Lake tables (that were not created in Dagster) available to your Dagster assets. Use the SourceAsset object and pass the table name as the key argument:

    from dagster import SourceAsset\n\niris_harvest_data = SourceAsset(key=\"more_animal_data\")\n

    This will load a table more_animal_data located at <root_uri>/<schema> as configured in the Definitions object above (see Delta Lake I/O Manager section).

    "},{"location":"integrations/delta-lake-dagster/#column-pruning","title":"Column Pruning","text":"

    You can often improve the efficiency of your computations by only loading specific columns of your Delta table. This is called column pruning.

    With the Delta Lake I/O manager, you can select specific columns to load by defining them in the metadata parameter of the AssetIn that loads the upstream Asset:

    import pyarrow as pa\nfrom dagster import AssetIn, asset\n\n# this example uses the clean_dataset Asset defined earlier\n\n@asset(\n       ins={\n           \"mammal_bool\": AssetIn(\n               key=\"clean_dataset\",\n               metadata={\"columns\": [\"is_mammal\", \"animals\"]},\n           )\n       }\n)\ndef mammal_data(mammal_bool: pa.Table) -> pa.Table:\n   mammals = mammal_bool[\"is_mammal\"].cast(\"bool\")\n   animals = mammal_bool[\"animals\"]\n   data = {\"mammal_bool\": mammals, \"animals\": animals}\n   return pa.Table.from_pydict(data)\n

    Here, we select only the is_mammal and animals columns from the clean_dataset table and load them into an AssetIn object called mammal_bool. This AssetIn object is used to create a new Asset mammal_data, containing only the selected columns.

    "},{"location":"integrations/delta-lake-dagster/#working-with-partitioned-assets","title":"Working with Partitioned Assets","text":"

    Partitioning is an important feature of Delta Lake that can make your computations more efficient. The Delta Lake I/O manager helps you read and write partitioned data easily. You can work with static partitions, time-based partitions, multi-partitions, and dynamic partitions.

    For example, you can partition the animals dataset on the animals column as follows:

    import pyarrow as pa\nimport pyarrow.compute as pc\n\nfrom dagster import StaticPartitionsDefinition, asset\n\n@asset(\n  partitions_def=StaticPartitionsDefinition(\n      [\"Flamingo\", \"Horse\",]\n  ),\n  metadata={\"partition_expr\": \"animals\"},\n)\ndef dataset_partitioned(\n   context,\n   clean_dataset: pa.Table,\n   ) -> pa.Table:\n   animals = context.asset_partition_key_for_output()\n   table = clean_dataset\n\n   return table.filter(pc.field(\"animals\") == animals)\n

    To partition your data, make sure to include the relevant partitions_def and metadata arguments to the @asset decorator. Refer to the Dagster documentation on partitioning assets for more information.

    "},{"location":"integrations/delta-lake-dagster/#using-delta-lake-and-dagster-with-pandas","title":"Using Delta Lake and Dagster with Pandas","text":"

    To read and write data to Delta Lake using pandas, use the DeltaLakePandasIOManager().

    You will need to install it using:

    pip install dagster-deltalake-pandas\n

    In your Definitions object, change the io_manager to DeltaLakePandasIOManager():

    from dagster_deltalake_pandas import DeltaLakePandasIOManager\n\n\ndefs = Definitions(\n   assets=all_assets,\n   resources={\n        \"io_manager\": DeltaLakePandasIOManager(\n            root_uri=\"path/to/deltalake\",\n            storage_options=LocalConfig(),\n            schema=\"dagster_deltalake\",\n        ),\n   },\n)\n

    Now you can read and write Dagster Assets defined as pandas DataFrames in Delta Lake format. For example:

    import pandas as pd\nfrom dagster import asset\n\n@asset\ndef iris_dataset() -> pd.DataFrame:\n   return pd.read_csv(\n       \"https://docs.dagster.io/assets/iris.csv\",\n       names=[\n           \"sepal_length_cm\",\n           \"sepal_width_cm\",\n           \"petal_length_cm\",\n           \"petal_width_cm\",\n           \"species\",\n       ],\n   )\n
    "},{"location":"integrations/delta-lake-dagster/#using-delta-lake-and-dagster-with-polars","title":"Using Delta Lake and Dagster with Polars","text":"

    To read and write data to Delta Lake using Polars, use the DeltaLakePolarsIOManager().

    You will need to install it using:

    pip install dagster-deltalake-polars\n

    In your Definitions object, change the io_manager to DeltaLakePolarsIOManager():

    from dagster_deltalake_polars import DeltaLakePolarsIOManager\n\ndefs = Definitions(\n   assets=all_assets,\n   resources={\n        \"io_manager\": DeltaLakePolarsIOManager(\n            root_uri=\"path/to/deltalake\",\n            storage_options=LocalConfig(),\n            schema=\"dagster_deltalake\",\n        ),\n   },\n)\n

    Now you can read and write Dagster Assets defined as Polars DataFrames in Delta Lake format. For example:

    import polars as pl\nfrom dagster import asset\n\n\n@asset\ndef iris_dataset() -> pl.DataFrame:\n    return pl.read_csv(\n        \"https://docs.dagster.io/assets/iris.csv\",\n        new_columns=[\n            \"sepal_length_cm\",\n            \"sepal_width_cm\",\n            \"petal_length_cm\",\n            \"petal_width_cm\",\n            \"species\",\n        ],\n        has_header=False,\n    )\n
    "},{"location":"integrations/delta-lake-dagster/#delta-lake-table-resource","title":"Delta Lake Table Resource","text":"

    I/O managers are a helpful tool in many common usage situations. But when you need lower-level access, the I/O Manager might not be the right tool to use. In these cases you may want to use the Delta Lake Table Resource.

    The Delta Lake Table Resource is a low-level access method to the table object. It gives you more fine-grained control and allows for modeling of more complex data. You can also use the Table Resource to run optimization and vacuuming jobs.

    "},{"location":"integrations/delta-lake-dagster/#schema-and-constraint-enforcement","title":"Schema and Constraint Enforcement","text":"

    Delta Lake provides built-in checks to ensure schema consistency when appending data to a table, as well as the ability to evolve the schema. This is a great feature for the Dagster community as it prevents bad data from being appended to tables, ensuring data consistency and accuracy.

    Read more about how to add constraints to a table in the Delta Lake documentation.

    "},{"location":"integrations/delta-lake-dagster/#z-ordering","title":"Z-Ordering","text":"

    Delta Lake offers Z-ordering functionality to colocate similar data in the same files. This can make your Delta Table queries much more efficient via file skipping. Dagster users can now benefit from this great feature through the Delta Lake I/O Manager.

    Read more about Z-Ordering on the Delta Lake blog.

    "},{"location":"integrations/delta-lake-dagster/#contribute","title":"Contribute","text":"

    To contribute to the Delta Lake and Dagster integration, go to [link]

    "},{"location":"integrations/delta-lake-dask/","title":"Using Delta Lake with Dask","text":"

    Delta Lake is a great storage format for Dask analyses. This page explains why and how to use Delta Lake with Dask.

    You will learn how to read Delta Lakes into Dask DataFrames, how to query Delta tables with Dask, and the unique advantages Delta Lake offers the Dask community.

    Here are some of the benefits that Delta Lake provides Dask users:

    • Better performance with file skipping
    • Enhanced file skipping via Z Ordering
    • ACID transactions for reliable writes
    • Easy time-travel functionality

    \u2757\ufe0f dask-deltatable currently works with deltalake<=0.13.0. See https://github.com/dask-contrib/dask-deltatable/issues/65

    "},{"location":"integrations/delta-lake-dask/#install-dask-deltatable","title":"Install Dask-Deltatable","text":"

    To use Delta Lake with Dask, first install the library using

    pip install dask-deltatable\n
    "},{"location":"integrations/delta-lake-dask/#reading-delta-tables-into-a-dask-dataframe","title":"Reading Delta Tables into a Dask DataFrame","text":"

    You can read data stored in a Delta Lake into a Dask DataFrame using dask_deltatable.read_deltalake.

    Let's read in a toy dataset to see what we can do with Delta Lake and Dask. You can access the data stored as a Delta Lake on Github.

    import dask_deltatable as ddt\n\n# read delta table into Dask DataFrame\ndelta_path = \"path/to/data/people_countries_delta_dask\"\nddf = ddt.read_deltalake(delta_path)\n

    Dask is a library for efficient distributed computing and works with lazy evaluation. Function calls to dask.dataframe build a task graph in the background. To trigger computation, call .compute():

    > ddf.compute()\n\n|    | first_name   | last_name   | country   | continent   |\n|---:|:-------------|:------------|:----------|:------------|\n|  0 | Ernesto      | Guevara     | Argentina | NaN         |\n|  0 | Bruce        | Lee         | China     | Asia        |\n|  1 | Jack         | Ma          | China     | Asia        |\n|  0 | Wolfgang     | Manche      | Germany   | NaN         |\n|  1 | Soraya       | Jala        | Germany   | NaN         |\n

    You can read in specific versions of Delta tables by specifying a version number or a timestamp:

    # with specific version\nddf = ddt.read_deltalake(delta_path, version=3)\n\n# with specific datetime\nddt.read_deltalake(delta_path, datetime=\"2018-12-19T16:39:57-08:00\")\n

    dask-deltatable also supports reading from remote sources like S3 with:

    ddt.read_deltalake(\"s3://bucket_name/delta_path\", version=3)\n

    To read data from remote sources you'll need to make sure the credentials are properly configured in environment variables or config files. Refer to your cloud provider documentation to configure these.

    "},{"location":"integrations/delta-lake-dask/#what-can-i-do-with-a-dask-deltatable","title":"What can I do with a Dask Deltatable?","text":"

    Reading a Delta Lake in with dask-deltatable returns a regular Dask DataFrame. You can perform all the regular Dask operations on this DataFrame.

    Let's take a look at the first few rows:

    > ddf.head(n=3)\n\n|    | first_name   | last_name   | country   |   continent |\n|---:|:-------------|:------------|:----------|------------:|\n|  0 | Ernesto      | Guevara     | Argentina |         nan |\n

    dask.dataframe.head() shows you the first rows of the first partition in the dataframe. In this case, the first partition only has 1 row.

    This is because the Delta Lake has been partitioned by country:

    > !ls ../../data/people_countries_delta_dask\n_delta_log        country=Argentina country=China     country=Germany\n

    dask-deltatable neatly reads in the partitioned Delta Lake into corresponding Dask DataFrame partitions:

    > # see number of partitions\n> ddf.npartitions\n3\n

    You can inspect a single partition using dask.dataframe.get_partition():

    > ddf.get_partition(n=1).compute()\n\n|    | first_name   | last_name   | country   | continent   |\n|---:|:-------------|:------------|:----------|:------------|\n|  0 | Bruce        | Lee         | China     | Asia        |\n|  1 | Jack         | Ma          | China     | Asia        |\n
    "},{"location":"integrations/delta-lake-dask/#perform-dask-operations","title":"Perform Dask Operations","text":"

    Let's perform some basic computations over the Delta Lake data that's now stored in our Dask DataFrame.

    Suppose you want to group the dataset by the country column:

    > ddf.groupby(['country']).count().compute()\n\n| country   |   first_name |   last_name |   continent |\n|:----------|-------------:|------------:|------------:|\n| Argentina |            1 |           1 |           1 |\n| China     |            2 |           2 |           2 |\n| Germany   |            2 |           2 |           2 |\n

    Dask executes this groupby operation in parallel across all available cores.

    "},{"location":"integrations/delta-lake-dask/#map-functions-over-partitions","title":"Map Functions over Partitions","text":"

    You can also use Dask's map_partitions method to map a custom Python function over all the partitions.

    Let's write a function that will replace the missing continent values with the right continent names.

    # define custom python function\n\n# get na_string\ndf = ddf.get_partition(0).compute()\nna_string = df.iloc[0].continent\nna_string\n\n# define function\ndef replace_proper(partition, na_string):\n    # use .any() so the condition actually checks the partition contents\n    if (partition.country == \"Argentina\").any():\n        partition.loc[partition.country==\"Argentina\"] = partition.loc[partition.country==\"Argentina\"].replace(na_string, \"South America\")\n    if (partition.country == \"Germany\").any():\n        partition.loc[partition.country==\"Germany\"] = partition.loc[partition.country==\"Germany\"].replace(na_string, \"Europe\")\n    return partition\n

    Now map this over all partitions in the Dask DataFrame:

    # define metadata and map function over partitions\n> meta = dict(ddf.dtypes)\n> ddf3 = ddf.map_partitions(replace_proper, na_string, meta=meta)\n> ddf3.compute()\n\n|    | first_name   | last_name   | country   | continent     |\n|---:|:-------------|:------------|:----------|:--------------|\n|  0 | Ernesto      | Guevara     | Argentina | South America |\n|  0 | Bruce        | Lee         | China     | Asia          |\n|  1 | Jack         | Ma          | China     | Asia          |\n|  0 | Wolfgang     | Manche      | Germany   | Europe        |\n|  1 | Soraya       | Jala        | Germany   | Europe        |\n
    "},{"location":"integrations/delta-lake-dask/#write-to-delta-lake","title":"Write to Delta Lake","text":"

    After doing your data processing in Dask, you can write the data back out to Delta Lake using to_deltalake:

    ddt.to_deltalake(ddf, \"tmp/test_write\")\n
    "},{"location":"integrations/delta-lake-dask/#contribute-to-dask-deltalake","title":"Contribute to dask-deltalake","text":"

    To contribute, go to the dask-deltalake Github repository.

    "},{"location":"integrations/delta-lake-datafusion/","title":"Using Delta Lake with DataFusion","text":"

    This page explains how to use Delta Lake with DataFusion.

    Delta Lake offers DataFusion users better performance and more features compared to other formats like CSV or Parquet.

    Delta Lake works well with the DataFusion Rust API and the DataFusion Python API. It's a great option for all DataFusion users.

    Delta Lake also depends on DataFusion to implement SQL-related functionality under the hood. We will also discuss this dependency at the end of this guide in case you're interested in learning more about the symbiotic relationship between the two libraries.

    "},{"location":"integrations/delta-lake-datafusion/#delta-lake-performance-benefits-for-datafusion-users","title":"Delta Lake performance benefits for DataFusion users","text":"

    Let's run some DataFusion queries on a Parquet file and a Delta table with the same data to learn more about the performance benefits of Delta Lake.

    Suppose you have the following dataset with 1 billion rows and 9 columns. Here are the first three rows of data:

    +-------+-------+--------------+-------+-------+--------+------+------+---------+\n| id1   | id2   | id3          |   id4 |   id5 |    id6 |   v1 |   v2 |      v3 |\n|-------+-------+--------------+-------+-------+--------+------+------+---------|\n| id016 | id046 | id0000109363 |    88 |    13 | 146094 |    4 |    6 | 18.8377 |\n| id039 | id087 | id0000466766 |    14 |    30 | 111330 |    4 |   14 | 46.7973 |\n| id047 | id098 | id0000307804 |    85 |    23 | 187639 |    3 |    5 | 47.5773 |\n+-------+-------+--------------+-------+-------+--------+------+------+---------+\n

    Here's how to register a Delta Lake table as a PyArrow dataset:

    from datafusion import SessionContext\nfrom deltalake import DeltaTable\n\nctx = SessionContext()\ntable = DeltaTable(\"G1_1e9_1e2_0_0\")\nctx.register_dataset(\"my_delta_table\", table.to_pyarrow_dataset())\n

    Now query the table:

    ctx.sql(\"select id1, sum(v1) as v1 from my_delta_table where id1='id096' group by id1\")\n

    That query takes 2.8 seconds to execute.

    Let's register the same dataset as a Parquet table, run the same query, and compare the runtime difference.

    Register the Parquet table and run the query:

    path = \"G1_1e9_1e2_0_0.parquet\"\nctx.register_parquet(\"my_parquet_table\", path)\nctx.sql(\"select id1, sum(v1) as v1 from my_parquet_table where id1='id096' group by id1\")\n

    This query takes 5.3 seconds to run.

    Parquet stores data in row groups and DataFusion can intelligently skip row groups that don't contain relevant data, so the query runs faster than it would on a file format like CSV, which doesn't support row group skipping.

    Delta Lake stores file-level metadata information in the transaction log, so it can skip entire files when queries are executed. Delta Lake can skip entire files and then skip row groups within the individual files. This makes Delta Lake even faster than Parquet files, especially for larger datasets spread across many files.
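
    You can see the file-level statistics that power this skipping by inspecting the add actions recorded in the transaction log. Here's a minimal sketch, assuming the same table path as above:

    from deltalake import DeltaTable\n\n# each row describes one data file, including the min/max statistics used for file skipping\nDeltaTable(\"G1_1e9_1e2_0_0\").get_add_actions(flatten=True).to_pandas()\n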

    "},{"location":"integrations/delta-lake-datafusion/#delta-lake-features-for-datafusion-users","title":"Delta Lake features for DataFusion users","text":"

    Delta Lake also provides other features that are useful for DataFusion users like ACID transactions, concurrency protection, time travel, versioned data, and more.

    "},{"location":"integrations/delta-lake-datafusion/#why-delta-lake-depends-on-datafusion","title":"Why Delta Lake depends on DataFusion","text":"

    Delta Lake depends on DataFusion to provide some end-user features.

    DataFusion is useful in providing SQL-related Delta Lake features. Some examples:

    • Update and merge are written in terms of SQL expressions.
    • Invariants and constraints are written in terms of SQL expressions.

    Anytime we have to evaluate SQL, we need some sort of SQL engine. We use DataFusion for that.

    "},{"location":"integrations/delta-lake-datafusion/#conclusion","title":"Conclusion","text":"

    Delta Lake is a great file format for DataFusion users.

    Delta Lake also uses DataFusion to provide some end-user features.

    DataFusion and Delta Lake have a wonderful symbiotic relationship and play very nicely with each other.

    See this guide for more information on Delta Lake and PyArrow and why PyArrow Datasets are often a better option than PyArrow tables.

    "},{"location":"integrations/delta-lake-pandas/","title":"Using Delta Lake with pandas","text":"

    Delta Lake is a great storage system for pandas analyses. This page shows how easy it is to use Delta Lake with pandas, the unique features Delta Lake offers pandas users, and how Delta Lake can make your pandas analyses run faster.

    Delta Lake is very easy to install for pandas analyses: just run pip install deltalake.

    Delta Lake allows for performance optimizations, so pandas queries can run much faster than the same queries run on data stored in CSV or Parquet. See the following chart for the query runtime on a Delta table compared with CSV/Parquet.

    Z Ordered Delta tables run this query much faster than when the data is stored in Parquet or CSV. Let's dive deeper and see how Delta Lake makes pandas faster.

    "},{"location":"integrations/delta-lake-pandas/#delta-lake-makes-pandas-queries-run-faster","title":"Delta Lake makes pandas queries run faster","text":"

    There are a few reasons Delta Lake can make pandas queries run faster:

    1. column pruning: only grabbing the columns relevant for a query
    2. file skipping: only reading files with data for the query
    3. row group skipping: only reading row groups with data for the query
    4. Z ordering data: colocating similar data in the same files, so file skipping is more effective

    Reading less data (fewer columns and/or fewer rows) is how Delta Lake makes pandas queries run faster.

    Parquet allows for column pruning and row group skipping, but doesn't support file-level skipping or Z Ordering. CSV doesn't support any of these performance optimizations.

    Let's take a look at a sample dataset and run a query to see the performance enhancements offered by Delta Lake.

    Suppose you have a 1 billion row dataset with 9 columns. Here are the first three rows of the dataset:

    +-------+-------+--------------+-------+-------+--------+------+------+---------+\n| id1   | id2   | id3          |   id4 |   id5 |    id6 |   v1 |   v2 |      v3 |\n|-------+-------+--------------+-------+-------+--------+------+------+---------|\n| id016 | id046 | id0000109363 |    88 |    13 | 146094 |    4 |    6 | 18.8377 |\n| id039 | id087 | id0000466766 |    14 |    30 | 111330 |    4 |   14 | 46.7973 |\n| id047 | id098 | id0000307804 |    85 |    23 | 187639 |    3 |    5 | 47.5773 |\n+-------+-------+--------------+-------+-------+--------+------+------+---------+\n

    The dataset is roughly 50 GB when stored as an uncompressed CSV file. Let's run some queries on a 2021 MacBook M1 with 64 GB of RAM.

    Start by running the query on an uncompressed CSV file:

    from pathlib import Path\n\nimport pandas as pd\n\n(\n    pd.read_csv(f\"{Path.home()}/data/G1_1e9_1e2_0_0.csv\", usecols=[\"id1\", \"id2\", \"v1\"])\n    .query(\"id1 == 'id016'\")\n    .groupby(\"id2\")\n    .agg({\"v1\": \"sum\"})\n)\n

    This query takes 234 seconds to execute. It runs out of memory if the usecols parameter is not set.

    Now let's convert the CSV dataset to Parquet and run the same query on the data stored in a Parquet file.

    (\n    pd.read_parquet(\n        f\"{Path.home()}/data/G1_1e9_1e2_0_0.parquet\", columns=[\"id1\", \"id2\", \"v1\"]\n    )\n    .query(\"id1 == 'id016'\")\n    .groupby(\"id2\")\n    .agg({\"v1\": \"sum\"})\n)\n

    This query takes 118 seconds to execute.

    Parquet stores data in row groups and allows for skipping when filter predicates are set. Run the Parquet query again with row group skipping enabled:

    (\n    pd.read_parquet(\n        f\"{Path.home()}/data/G1_1e9_1e2_0_0.parquet\",\n        columns=[\"id1\", \"id2\", \"v1\"],\n        filters=[(\"id1\", \"==\", \"id016\")],\n    )\n    .query(\"id1 == 'id016'\")\n    .groupby(\"id2\")\n    .agg({\"v1\": \"sum\"})\n)\n

    This query runs in 19 seconds. Lots of row groups can be skipped for this particular query.

    Now let's run the same query on a Delta table to see the out-of-the-box performance:

    (\n    DeltaTable(f\"{Path.home()}/data/deltalake_baseline_G1_1e9_1e2_0_0\", version=0)\n    .to_pandas(filters=[(\"id1\", \"==\", \"id016\")], columns=[\"id1\", \"id2\", \"v1\"])\n    .query(\"id1 == 'id016'\")\n    .groupby(\"id2\")\n    .agg({\"v1\": \"sum\"})\n)\n

    This query runs in 8 seconds, which is a significant performance enhancement.

    Now let's Z Order the Delta table by id1, which will make the data skipping even better.
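
    The Z Ordering step itself can be performed with the optimize API. Here's a minimal sketch, assuming the same table path as above:

    from pathlib import Path\n\nfrom deltalake import DeltaTable\n\ndt = DeltaTable(f\"{Path.home()}/data/deltalake_baseline_G1_1e9_1e2_0_0\")\ndt.optimize.z_order([\"id1\"])\n

    Run the query again on the Z Ordered Delta table: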

    (\n    DeltaTable(f\"{Path.home()}/data/deltalake_baseline_G1_1e9_1e2_0_0\", version=1)\n    .to_pandas(filters=[(\"id1\", \"==\", \"id016\")], columns=[\"id1\", \"id2\", \"v1\"])\n    .query(\"id1 == 'id016'\")\n    .groupby(\"id2\")\n    .agg({\"v1\": \"sum\"})\n)\n

    The query now executes in 2.4 seconds.

    Delta tables can make certain pandas queries run much faster.

    "},{"location":"integrations/delta-lake-pandas/#delta-lake-lets-pandas-users-time-travel","title":"Delta Lake lets pandas users time travel","text":"

    Start by creating a Delta table:

    import pandas as pd\nfrom deltalake import write_deltalake, DeltaTable\n\ndf = pd.DataFrame({\"num\": [1, 2, 3], \"letter\": [\"a\", \"b\", \"c\"]})\nwrite_deltalake(\"tmp/some-table\", df)\n

    Here are the contents of the Delta table (version 0 of the Delta table):

    +-------+----------+\n|   num | letter   |\n|-------+----------|\n|     1 | a        |\n|     2 | b        |\n|     3 | c        |\n+-------+----------+\n

    Now append two rows to the Delta table:

    df = pd.DataFrame({\"num\": [8, 9], \"letter\": [\"dd\", \"ee\"]})\nwrite_deltalake(\"tmp/some-table\", df, mode=\"append\")\n

    Here are the contents after the append operation (version 1 of the Delta table):

    +-------+----------+\n|   num | letter   |\n|-------+----------|\n|     1 | a        |\n|     2 | b        |\n|     3 | c        |\n|     8 | dd       |\n|     9 | ee       |\n+-------+----------+\n

    Now perform an overwrite transaction:

    df = pd.DataFrame({\"num\": [11, 22], \"letter\": [\"aa\", \"bb\"]})\nwrite_deltalake(\"tmp/some-table\", df, mode=\"overwrite\")\n

    Here are the contents after the overwrite operation (version 2 of the Delta table):

    +-------+----------+\n|   num | letter   |\n|-------+----------|\n|     8 | dd       |\n|     9 | ee       |\n+-------+----------+\n

    Read in the Delta table and it will grab the latest version by default:

    DeltaTable(\"tmp/some-table\").to_pandas()\n\n+-------+----------+\n|   num | letter   |\n|-------+----------|\n|    11 | aa       |\n|    22 | bb       |\n+-------+----------+\n

    You can easily time travel back to version 0 of the Delta table:

    DeltaTable(\"tmp/some-table\", version=0).to_pandas()\n\n+-------+----------+\n|   num | letter   |\n|-------+----------|\n|     1 | a        |\n|     2 | b        |\n|     3 | c        |\n+-------+----------+\n

    You can also time travel to version 1 of the Delta table:

    DeltaTable(\"tmp/some-table\", version=1).to_pandas()\n\n+-------+----------+\n|   num | letter   |\n|-------+----------|\n|     1 | a        |\n|     2 | b        |\n|     3 | c        |\n|     8 | dd       |\n|     9 | ee       |\n+-------+----------+\n

    Time travel is a powerful feature that pandas users cannot access with CSV or Parquet.

    "},{"location":"integrations/delta-lake-pandas/#schema-enforcement","title":"Schema enforcement","text":"

    By default, Delta tables only allow you to append DataFrames with a matching schema. Suppose you have a DataFrame with num and animal columns, which differs from the existing Delta table that has num and letter columns.

    Try to append this DataFrame with a mismatched schema to the existing table:

    df = pd.DataFrame({\"num\": [5, 6], \"animal\": [\"cat\", \"dog\"]})\nwrite_deltalake(\"tmp/some-table\", df)\n

    This transaction will be rejected and will return the following error message:

    ValueError: Schema of data does not match table schema\nData schema:\nnum: int64\nanimal: string\n-- schema metadata --\npandas: '{\"index_columns\": [{\"kind\": \"range\", \"name\": null, \"start\": 0, \"' + 474\nTable Schema:\nnum: int64\nletter: string\n

    Schema enforcement protects your table from getting corrupted by appending data with mismatched schema. Parquet and CSV don't offer schema enforcement for pandas users.

    "},{"location":"integrations/delta-lake-pandas/#overwriting-schema-of-table","title":"Overwriting schema of table","text":"

    You can overwrite the table contents and schema by setting the schema_mode option. Here's how to overwrite both the table contents and the schema:

    write_deltalake(\"tmp/some-table\", df, mode=\"overwrite\", schema_mode=\"overwrite\")\n

    Here are the contents of the table after the values and schema have been overwritten:

    +-------+----------+\n|   num | animal   |\n|-------+----------|\n|     5 | cat      |\n|     6 | dog      |\n+-------+----------+\n

    If you want the schema to be merged instead, specify schema_mode=\"merge\".
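
    Here's a minimal sketch of an append with schema merging (this assumes the rust engine, since schema evolution may not be supported by the default pyarrow writer):

    write_deltalake(\"tmp/some-table\", df, mode=\"append\", schema_mode=\"merge\", engine=\"rust\")\n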

    "},{"location":"integrations/delta-lake-pandas/#in-memory-vs-in-storage-data-changes","title":"In-memory vs. in-storage data changes","text":"

    It's important to distinguish between data stored in-memory and data stored on disk when understanding the functionality offered by Delta Lake.

    pandas loads data from storage (CSV, Parquet, or Delta Lake) into in-memory DataFrames.

    pandas makes it easy to modify the data in memory, say, updating a column value. It's not easy to update a column value in storage systems like CSV or Parquet using pandas.

    Delta Lake makes it easy for pandas users to update data in storage.
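
    For example, here's a minimal sketch of updating a column value directly in storage with the update API, using the num/animal table from the previous section (the new value is just for illustration):

    from deltalake import DeltaTable\n\ndt = DeltaTable(\"tmp/some-table\")\n\n# set animal to 'bird' for every row where num equals 5\ndt.update(updates={\"animal\": \"'bird'\"}, predicate=\"num = 5\")\n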

    "},{"location":"integrations/delta-lake-pandas/#why-delta-lake-allows-for-faster-queries","title":"Why Delta Lake allows for faster queries","text":"

    Delta tables store data in many files and metadata about the files in the transaction log. Delta Lake allows for certain queries to skip entire files, which makes pandas queries run much faster.

    "},{"location":"integrations/delta-lake-pandas/#more-resources","title":"More resources","text":"

    See this talk on why Delta Lake is the best file format for pandas analyses to learn more:

    "},{"location":"integrations/delta-lake-pandas/#conclusion","title":"Conclusion","text":"

    Delta Lake provides many features that make it an excellent format for pandas analyses:

    • performance optimizations make pandas queries run faster
    • data management features make pandas analyses more reliable
    • advanced features allow you to perform more complex pandas analyses

    Python deltalake offers pandas users a better experience compared with CSV/Parquet.

    "},{"location":"integrations/delta-lake-polars/","title":"Using Delta Lake with polars","text":"

    This page explains why Delta Lake is a great storage system for Polars analyses.

    You will learn how to create Delta tables with Polars, how to query Delta tables with Polars, and the unique advantages Delta Lake offers the Polars community.

    Here are some amazing benefits that Delta Lake provides Polars users:

    • time travel
    • ACID transactions for reliable writes
    • better performance with file skipping
    • enhanced file skipping via Z Ordering
    • ability to rollback mistakes
    • and many, many more

    Let's start by showing how to use Polars with Delta Lake, explore how Delta Lake can make Polars queries run faster, and then look at all the cool features Delta Lake offers Polars users.

    "},{"location":"integrations/delta-lake-polars/#creating-a-delta-lake-table-with-polars","title":"Creating a Delta Lake table with Polars","text":"

    Create a Polars DataFrame and write it out to a Delta table:

    import polars as pl\n\ndf = pl.DataFrame({\"x\": [1, 2, 3]})\ndf.write_delta(\"tmp/bear_delta_lake\")\n

    Inspect the contents of the Delta table:

    print(pl.read_delta(\"tmp/bear_delta_lake\"))\n\n+-----+\n| x   |\n| --- |\n| i64 |\n+=====+\n| 1   |\n| 2   |\n| 3   |\n+-----+\n

    Now create another Polars DataFrame and append it to the existing Delta table:

    df2 = pl.DataFrame({\"x\": [8, 9, 10]})\ndf2.write_delta(\"tmp/bear_delta_lake\", mode=\"append\")\n

    Re-inspect the contents of the Delta table:

    print(pl.read_delta(\"tmp/bear_delta_lake\"))\n\n+-----+\n| x   |\n| --- |\n| i64 |\n+=====+\n| 1   |\n| 2   |\n| 3   |\n| 8   |\n| 9   |\n| 10  |\n+-----+\n

    Now overwrite the existing Delta table:

    df3 = pl.DataFrame({\"x\": [55, 66, 77]})\ndf3.write_delta(\"tmp/bear_delta_lake\", mode=\"overwrite\")\n

    Inspect the Delta table:

    print(pl.read_delta(\"tmp/bear_delta_lake\"))\n\n+-----+\n| x   |\n| --- |\n| i64 |\n+=====+\n| 55  |\n| 66  |\n| 77  |\n+-----+\n

    The Delta table now has three versions, as shown in the following diagram:

    "},{"location":"integrations/delta-lake-polars/#time-travel-with-delta-lake-for-polars","title":"Time travel with Delta Lake for Polars","text":"

    Time travel back to version 0 of the Delta table:

    print(pl.read_delta(\"tmp/bear_delta_lake\", version=0))\n\n+-----+\n| x   |\n| --- |\n| i64 |\n+=====+\n| 1   |\n| 2   |\n| 3   |\n+-----+\n

    Time travel back to version 1 of the Delta table:

    print(pl.read_delta(\"tmp/bear_delta_lake\", version=1))\n\n+-----+\n| x   |\n| --- |\n| i64 |\n+=====+\n| 1   |\n| 2   |\n| 3   |\n| 9   |\n| 8   |\n| 10  |\n+-----+\n

    Read the Delta table without specifying a version and see how it reads the latest version by default:

    print(pl.read_delta(\"tmp/bear_delta_lake\"))\n\n+-----+\n| x   |\n| --- |\n| i64 |\n+=====+\n| 55  |\n| 66  |\n| 77  |\n+-----+\n

    Let's dive into how to read Delta tables with Polars in more detail and compare the query runtime performance on larger datasets.

    "},{"location":"integrations/delta-lake-polars/#reading-a-delta-lake-table-with-polars","title":"Reading a Delta Lake table with Polars","text":"

    Let's look at the h2o groupby dataset that has 1 billion rows and 9 columns. Here are the first three rows of the dataset:

    +-------+-------+--------------+-------+-------+--------+------+------+---------+\n| id1   | id2   | id3          |   id4 |   id5 |    id6 |   v1 |   v2 |      v3 |\n|-------+-------+--------------+-------+-------+--------+------+------+---------|\n| id016 | id046 | id0000109363 |    88 |    13 | 146094 |    4 |    6 | 18.8377 |\n| id039 | id087 | id0000466766 |    14 |    30 | 111330 |    4 |   14 | 46.7973 |\n| id047 | id098 | id0000307804 |    85 |    23 | 187639 |    3 |    5 | 47.5773 |\n+-------+-------+--------------+-------+-------+--------+------+------+---------+\n

    This dataset is 50GB when stored in an uncompressed CSV file. Let's run some queries on this dataset when it's stored in different file formats with Polars.

    This section will show the runtime for a query when the data is stored in CSV, Parquet, and Delta Lake and explain why Delta tables are the fastest.

    Start by running a query on an uncompressed CSV file with read_csv:

    pl.read_csv(\"~/data/G1_1e9_1e2_0_0.csv\").filter(pl.col(\"id1\") < \"id016\").group_by(\n    [\"id1\", \"id2\"]\n).agg(pl.sum(\"v1\").alias(\"v1_sum\")).collect()\n

    This query errors out after running for several minutes. The machine runs out of memory. Let's try it again with scan_csv.

    pl.scan_csv(\"~/data/G1_1e9_1e2_0_0.csv\").filter(pl.col(\"id1\") < \"id016\").group_by(\n    [\"id1\", \"id2\"]\n).agg(pl.sum(\"v1\").alias(\"v1_sum\")).collect()\n

    This query runs in 56.2 seconds.

    Now let's run the same query when the data is stored in a Parquet file:

    pl.scan_parquet(\"~/data/G1_1e9_1e2_0_0.parquet\").filter(\n    pl.col(\"id1\") < \"id016\"\n).group_by([\"id1\", \"id2\"]).agg(pl.sum(\"v1\").alias(\"v1_sum\")).collect()\n

    This query runs in 8.3 seconds. It's much faster because Polars is optimized to skip row groups in Parquet files that don't contain data that's relevant for the query.

    Then run the query on the newly created Delta table:

    pl.scan_delta(\"~/data/deltalake/G1_1e9_1e2_0_0\", version=1).filter(\n    pl.col(\"id1\") < \"id016\"\n).group_by([\"id1\", \"id2\"]).agg(pl.sum(\"v1\").alias(\"v1_sum\")).collect()\n

    This query runs in 7.2 seconds. Polars can run this query faster because it can inspect the Delta transaction log and skip entire files that don't contain relevant data before performing the ordinary Parquet row group skipping.

    Finally run the query on the Delta table after it has been Z Ordered by id1:

    pl.scan_delta(\"~/data/deltalake/G1_1e9_1e2_0_0\", version=2).filter(\n    pl.col(\"id1\") < \"id016\"\n).group_by([\"id1\", \"id2\"]).agg(pl.sum(\"v1\").alias(\"v1_sum\")).collect()\n

    This query runs in 3.5 seconds. The query on the Z Ordered Delta table is even faster because similar data has been co-located in the same files. This allows for even greater data skipping.

    Polars can leverage file skipping to query Delta tables very quickly.

    "},{"location":"integrations/delta-lake-polars/#why-polars-is-fast-with-delta-lake","title":"Why Polars is fast with Delta Lake","text":"

    Delta tables consist of metadata in a transaction log and data stored in Parquet files.

    When Polars queries a Delta table, it starts by consulting the transaction log to understand the metadata of each file in the Delta table. This allows for Polars to quickly identify which files should be skipped by the query.

    CSV files don't contain any such metadata, so file skipping isn't an option. Polars can skip Parquet files based on metadata, but it needs to open up each file and read the metadata, which is slower than grabbing the file-level metadata directly from the transaction log.

    Parquet doesn't allow users to easily Z Order the data and colocate similar data in the same row groups. The Z Order optimizations are only supported in Delta tables.

    Delta Lake offers Polars users unique performance optimizations.
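
    Polars itself doesn't expose a Z Order command, but you can Z Order a Delta table created by Polars with the deltalake package and then query it from Polars as usual. Here's a minimal sketch, reusing the table path from earlier:

    from deltalake import DeltaTable\n\ndt = DeltaTable(\"tmp/bear_delta_lake\")\ndt.optimize.z_order([\"x\"])\n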

    "},{"location":"integrations/delta-lake-polars/#other-delta-lake-features-relevant-for-polars-users","title":"Other Delta Lake features relevant for Polars users","text":"
    • ACID transactions for reliable writes
    • better performance with file skipping
    • enhanced file skipping via Z Ordering
    • ability to rollback mistakes
    "},{"location":"integrations/delta-lake-polars/#conclusion","title":"Conclusion","text":"

    This guide shows how Delta Lake is a great storage format for Polars analyses.

    Delta Lake is easy to use, fast, and full of features that are great for Polars users.

    "},{"location":"usage/appending-overwriting-delta-lake-table/","title":"Appending to and overwriting a Delta Lake table","text":"

    This section explains how to append to an existing Delta table and how to overwrite a Delta table.

    "},{"location":"usage/appending-overwriting-delta-lake-table/#delta-lake-append-transactions","title":"Delta Lake append transactions","text":"

    Suppose you have a Delta table with the following contents:

    +-------+----------+\n|   num | letter   |\n|-------+----------|\n|     1 | a        |\n|     2 | b        |\n|     3 | c        |\n+-------+----------+\n

    Append two additional rows of data to the table:

    import pandas as pd\nfrom deltalake import write_deltalake, DeltaTable\n\ndf = pd.DataFrame({\"num\": [8, 9], \"letter\": [\"dd\", \"ee\"]})\nwrite_deltalake(\"tmp/some-table\", df, mode=\"append\")\n

    Here are the updated contents of the Delta table:

    +-------+----------+\n|   num | letter   |\n|-------+----------|\n|     1 | a        |\n|     2 | b        |\n|     3 | c        |\n|     8 | dd       |\n|     9 | ee       |\n+-------+----------+\n

    Now let's see how to perform an overwrite transaction.

    "},{"location":"usage/appending-overwriting-delta-lake-table/#delta-lake-overwrite-transactions","title":"Delta Lake overwrite transactions","text":"

    Now let's see how to overwrite the existing Delta table.

    df = pd.DataFrame({\"num\": [11, 22], \"letter\": [\"aa\", \"bb\"]})\nwrite_deltalake(\"tmp/some-table\", df, mode=\"overwrite\")\n

    Here are the contents of the Delta table after the overwrite operation:

    +-------+----------+\n|   num | letter   |\n|-------+----------|\n|    11 | aa       |\n|    22 | bb       |\n+-------+----------+\n

    Overwriting just performs a logical delete. It doesn't physically remove the previous data from storage. Time travel back to the previous version to confirm that the old version of the table is still accessible.

    dt = DeltaTable(\"tmp/some-table\", version=1)\n\n+-------+----------+\n|   num | letter   |\n|-------+----------|\n|     1 | a        |\n|     2 | b        |\n|     3 | c        |\n|     8 | dd       |\n|     9 | ee       |\n+-------+----------+\n
    "},{"location":"usage/constraints/","title":"Adding a Constraint to a table","text":"

    Check constraints are a way to enforce that only data that meets the constraint is allowed to be added to the table.

    "},{"location":"usage/constraints/#add-the-constraint","title":"Add the Constraint","text":"Python Rust

    DeltaTable

    from deltalake import DeltaTable\n\ndt = DeltaTable(\"../rust/tests/data/simple_table\")\n\n# Check the schema before hand\nprint(dt.schema())\n# Add the constraint to the table.\ndt.alter.add_constraint({\"id_gt_0\": \"id > 0\"})\n

    DeltaTable

    let table = deltalake::open_table(\"../rust/tests/data/simple_table\").await?;\nlet ops = DeltaOps(table);\nops.with_constraint(\"id_gt_0\", \"id > 0\").await?;\n

    After you have added the constraint to the table, attempting to append data that violates the constraint will throw an error.

    "},{"location":"usage/constraints/#verify-the-constraint-by-trying-to-add-some-data","title":"Verify the constraint by trying to add some data","text":"Python Rust
    from deltalake import write_deltalake, DeltaTable\nimport pandas as pd\n\ndt = DeltaTable(\"../rust/tests/data/simple_table\")\n\ndf = pd.DataFrame({\"id\": [-1]})\nwrite_deltalake(dt, df, mode=\"append\", engine=\"rust\")\n# _internal.DeltaProtocolError: Invariant violations: [\"Check or Invariant (id > 0) violated by value in row: [-1]\"]\n
    let table = deltalake::open_table(\"../rust/tests/data/simple_table\").await?;\nlet schema = table.get_state().arrow_schema()?;\nlet invalid_values: Vec<Arc<dyn Array>> = vec![\n    Arc::new(Int32Array::from(vec![-10]))\n];\nlet batch = RecordBatch::try_new(schema, invalid_values)?;\ntable.write(vec![batch]).await?;\n

    Note: ensure you use the engine='rust' parameter when writing to the table as this feature is not supported in the default pyarrow writer.

    "},{"location":"usage/create-delta-lake-table/","title":"Creating a Delta Lake Table","text":"

    This section explains how to create a Delta Lake table.

    You can easily write a DataFrame to a Delta table.

    pandasPolars
    from deltalake import write_deltalake\nimport pandas as pd\n\ndf = pd.DataFrame({\"num\": [1, 2, 3], \"letter\": [\"a\", \"b\", \"c\"]})\nwrite_deltalake(\"tmp/some-table\", df)\n
    import polars as pl\n\ndf = pl.DataFrame({\"num\": [1, 2, 3], \"letter\": [\"a\", \"b\", \"c\"]})\ndf.write_delta(\"tmp/some-table\")\n

    Here are the contents of the Delta table in storage:

    +-------+----------+\n|   num | letter   |\n|-------+----------|\n|     1 | a        |\n|     2 | b        |\n|     3 | c        |\n+-------+----------+\n
    "},{"location":"usage/deleting-rows-from-delta-lake-table/","title":"Deleting rows from a Delta Lake table","text":"

    This section explains how to delete rows from a Delta Lake table.

    Suppose you have the following Delta table with four rows:

    +-------+----------+\n|   num | letter   |\n|-------+----------|\n|     1 | a        |\n|     2 | b        |\n|     3 | c        |\n|     4 | d        |\n+-------+----------+\n

    Here's how to delete all the rows where the num is greater than 2:

    dt = DeltaTable(\"tmp/my-table\")\ndt.delete(\"num > 2\")\n

    Here are the contents of the Delta table after the delete operation has been performed:

    +-------+----------+\n|   num | letter   |\n|-------+----------|\n|     1 | a        |\n|     2 | b        |\n+-------+----------+\n
    "},{"location":"usage/examining-table/","title":"Examining a Table","text":""},{"location":"usage/examining-table/#metadata","title":"Metadata","text":"

    The delta log maintains basic metadata about a table, including:

    • A unique id
    • A name, if provided
    • A description, if provided
    • The list of partitionColumns.
    • The created_time of the table
    • A map of table configuration. This includes fields such as delta.appendOnly, which if true indicates the table is not meant to have data deleted from it.

    Get metadata from a table with the DeltaTable.metadata() method:

    >>> from deltalake import DeltaTable\n>>> dt = DeltaTable(\"../rust/tests/data/simple_table\")\n>>> dt.metadata()\nMetadata(id: 5fba94ed-9794-4965-ba6e-6ee3c0d22af9, name: None, description: None, partitionColumns: [], created_time: 1587968585495, configuration={})\n
    "},{"location":"usage/examining-table/#schema","title":"Schema","text":"

    The schema for the table is also saved in the transaction log. It can either be retrieved in the Delta Lake form as Schema or as a PyArrow schema. The first allows you to introspect any column-level metadata stored in the schema, while the latter represents the schema the table will be loaded into.

    Use DeltaTable.schema to retrieve the delta lake schema:

    >>> from deltalake import DeltaTable\n>>> dt = DeltaTable(\"../rust/tests/data/simple_table\")\n>>> dt.schema()\nSchema([Field(id, PrimitiveType(\"long\"), nullable=True)])\n

    These schemas have a JSON representation that can be retrieved. To serialize the schema to JSON, use DeltaTable.schema().to_json(); to reconstruct a schema from a JSON string, use Schema.from_json().

    >>> dt.schema().to_json()\n'{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}}]}'\n
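
    To go the other way and rebuild a schema object from that JSON string, you can use Schema.from_json (a minimal sketch):

    >>> from deltalake import Schema\n>>> Schema.from_json(dt.schema().to_json())\nSchema([Field(id, PrimitiveType(\"long\"), nullable=True)])\n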

    Use DeltaTable.schema.to_pyarrow() to retrieve the PyArrow schema:

    >>> dt.schema().to_pyarrow()\nid: int64\n
    "},{"location":"usage/examining-table/#history","title":"History","text":"

    Depending on what system wrote the table, the delta table may have provenance information describing what operations were performed on the table, when, and by whom. This information is retained for 30 days by default, unless otherwise specified by the table configuration delta.logRetentionDuration.

    Note

    This information is not written by all writers and different writers may use different schemas to encode the actions. For Spark's format, see: https://docs.delta.io/latest/delta-utility.html#history-schema

    To view the available history, use DeltaTable.history:

    from deltalake import DeltaTable\n\ndt = DeltaTable(\"../rust/tests/data/simple_table\")\ndt.history()\n
    [{'timestamp': 1587968626537, 'operation': 'DELETE', 'operationParameters': {'predicate': '[\"((`id` % CAST(2 AS BIGINT)) = CAST(0 AS BIGINT))\"]'}, 'readVersion': 3, 'isBlindAppend': False},\n {'timestamp': 1587968614187, 'operation': 'UPDATE', 'operationParameters': {'predicate': '((id#697L % cast(2 as bigint)) = cast(0 as bigint))'}, 'readVersion': 2, 'isBlindAppend': False},\n {'timestamp': 1587968604143, 'operation': 'WRITE', 'operationParameters': {'mode': 'Overwrite', 'partitionBy': '[]'}, 'readVersion': 1, 'isBlindAppend': False},\n {'timestamp': 1587968596254, 'operation': 'MERGE', 'operationParameters': {'predicate': '(oldData.`id` = newData.`id`)'}, 'readVersion': 0, 'isBlindAppend': False},\n {'timestamp': 1587968586154, 'operation': 'WRITE', 'operationParameters': {'mode': 'ErrorIfExists', 'partitionBy': '[]'}, 'isBlindAppend': True}]\n
    "},{"location":"usage/examining-table/#current-add-actions","title":"Current Add Actions","text":"

    The active state for a delta table is determined by the Add actions, which provide the list of files that are part of the table and metadata about them, such as creation time, size, and statistics. You can get a data frame of the add actions data using DeltaTable.get_add_actions:

    >>> from deltalake import DeltaTable\n>>> dt = DeltaTable(\"../rust/tests/data/delta-0.8.0\")\n>>> dt.get_add_actions(flatten=True).to_pandas()\n                                                    path  size_bytes   modification_time  data_change  num_records  null_count.value  min.value  max.value\n0  part-00000-c9b90f86-73e6-46c8-93ba-ff6bfaf892a...         440 2021-03-06 15:16:07         True            2                 0          0          2\n1  part-00000-04ec9591-0b73-459e-8d18-ba5711d6cbe...         440 2021-03-06 15:16:16         True            2                 0          2          4\n

    This works even with past versions of the table:

    >>> dt = DeltaTable(\"../rust/tests/data/delta-0.8.0\", version=0)\n>>> dt.get_add_actions(flatten=True).to_pandas()\n                                                path  size_bytes   modification_time  data_change  num_records  null_count.value  min.value  max.value\n0  part-00000-c9b90f86-73e6-46c8-93ba-ff6bfaf892a...         440 2021-03-06 15:16:07         True            2                 0          0          2\n1  part-00001-911a94a2-43f6-4acb-8620-5e68c265498...         445 2021-03-06 15:16:07         True            3                 0          2          4\n
    "},{"location":"usage/installation/","title":"Installation","text":"

    The deltalake project can be installed via pip for Python or Cargo for Rust.

    "},{"location":"usage/installation/#install-delta-lake-for-python","title":"Install Delta Lake for Python","text":"

    With pip:

    pip install deltalake\n

    With Conda:

    conda install -c conda-forge deltalake\n
    "},{"location":"usage/installation/#install-delta-lake-for-rust","title":"Install Delta Lake for Rust","text":"

    With Cargo:

    cargo add deltalake\n
    "},{"location":"usage/installation/#run-delta-lake-and-pandas-in-a-jupyter-notebook","title":"Run Delta Lake and pandas in a Jupyter Notebook","text":"

    You can easily run Delta Lake and pandas in a Jupyter notebook.

    Create an environment file with the dependencies as follows:

    name: deltalake-minimal\nchannels:\n  - conda-forge\n  - defaults\ndependencies:\n  - python=3.11\n  - ipykernel\n  - pandas\n  - polars\n  - jupyterlab\n  - pip\n  - pip:\n    - deltalake\n

    Create a virtual environment with the dependencies:

    conda env create -f deltalake-minimal.yml\n

    Open the Jupyter notebook and run commands as follows:
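
    For example, a first notebook cell might look something like the following (a minimal sketch; the table path is arbitrary):

    import pandas as pd\nfrom deltalake import DeltaTable, write_deltalake\n\ndf = pd.DataFrame({\"num\": [1, 2, 3], \"letter\": [\"a\", \"b\", \"c\"]})\nwrite_deltalake(\"tmp/some-table\", df)\nDeltaTable(\"tmp/some-table\").to_pandas()\n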

    "},{"location":"usage/loading-table/","title":"Loading a Delta Table","text":"

    A DeltaTable represents the state of a delta table at a particular version. This includes which files are currently part of the table, the schema of the table, and other metadata such as creation time.

    Python Rust

    DeltaTable

    from deltalake import DeltaTable\n\ndt = DeltaTable(\"../rust/tests/data/delta-0.2.0\")\nprint(f\"Version: {dt.version()}\")\nprint(f\"Files: {dt.files()}\")\n

    DeltaTable

    let table = deltalake::open_table(\"../rust/tests/data/simple_table\").await.unwrap();\nprintln!(\"Version: {}\", table.version());\nprintln!(\"Files: {}\", table.get_files());\n

    Depending on your storage backend, you could use the storage_options parameter to provide some configuration. Configuration is defined for specific backends - s3 options, azure options, gcs options.

    >>> storage_options = {\"AWS_ACCESS_KEY_ID\": \"THE_AWS_ACCESS_KEY_ID\", \"AWS_SECRET_ACCESS_KEY\":\"THE_AWS_SECRET_ACCESS_KEY\"}\n>>> dt = DeltaTable(\"../rust/tests/data/delta-0.2.0\", storage_options=storage_options)\n

    The configuration can also be provided via the environment, and the basic service provider is derived from the URL being used. We try to support many of the well-known formats to identify basic service properties.

    S3:

    • s3://\\<bucket>/\\<path>
    • s3a://\\<bucket>/\\<path>

    Azure:

    • az://\\<container>/\\<path>
    • adl://\\<container>/\\<path>
    • abfs://\\<container>/\\<path>

    GCS:

    • gs://\\<bucket>/\\<path>
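
    For example, S3 credentials could be provided via environment variables before the table is loaded. Here's a minimal sketch with placeholder values:

    import os\n\nfrom deltalake import DeltaTable\n\nos.environ[\"AWS_ACCESS_KEY_ID\"] = \"THE_AWS_ACCESS_KEY_ID\"\nos.environ[\"AWS_SECRET_ACCESS_KEY\"] = \"THE_AWS_SECRET_ACCESS_KEY\"\n\ndt = DeltaTable(\"s3://<bucket>/<path>\")\n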

    Alternatively, if you have a data catalog you can load it by reference to a database and table name. Currently only AWS Glue is supported.

    For AWS Glue catalog, use AWS environment variables to authenticate.

    >>> from deltalake import DeltaTable\n>>> from deltalake import DataCatalog\n>>> database_name = \"simple_database\"\n>>> table_name = \"simple_table\"\n>>> data_catalog = DataCatalog.AWS\n>>> dt = DeltaTable.from_data_catalog(data_catalog=data_catalog, database_name=database_name, table_name=table_name)\n>>> dt.to_pyarrow_table().to_pydict()\n{'id': [5, 7, 9, 5, 6, 7, 8, 9]}\n
    "},{"location":"usage/loading-table/#custom-storage-backends","title":"Custom Storage Backends","text":"

    While delta always needs its internal storage backend to work and be properly configured in order to manage the delta log, it may sometimes be advantageous - and is common practice in the arrow world - to customize the storage interface used for reading the bulk data.

    deltalake will work with any storage compliant with pyarrow.fs.FileSystem; however, the root of the filesystem has to be adjusted to point at the root of the Delta table. We can achieve this by wrapping the custom filesystem into a pyarrow.fs.SubTreeFileSystem.

    import pyarrow.fs as fs\nfrom deltalake import DeltaTable\n\npath = \"<path/to/table>\"\nfilesystem = fs.SubTreeFileSystem(path, fs.LocalFileSystem())\n\ndt = DeltaTable(path)\nds = dt.to_pyarrow_dataset(filesystem=filesystem)\n

    When using the pyarrow factory method for file systems, the normalized path is provided on creation. In case of S3 this would look something like:

    import pyarrow.fs as fs\nfrom deltalake import DeltaTable\n\ntable_uri = \"s3://<bucket>/<path>\"\nraw_fs, normalized_path = fs.FileSystem.from_uri(table_uri)\nfilesystem = fs.SubTreeFileSystem(normalized_path, raw_fs)\n\ndt = DeltaTable(table_uri)\nds = dt.to_pyarrow_dataset(filesystem=filesystem)\n
    "},{"location":"usage/loading-table/#time-travel","title":"Time Travel","text":"

    To load previous table states, you can provide the version number you wish to load:

    >>> dt = DeltaTable(\"../rust/tests/data/simple_table\", version=2)\n

    Once you\\'ve loaded a table, you can also change versions using either a version number or datetime string:

    >>> dt.load_version(1)\n>>> dt.load_with_datetime(\"2021-11-04 00:05:23.283+00:00\")\n

    Warning

    Previous table versions may not exist if they have been vacuumed, in which case an exception will be thrown. See Vacuuming tables for more information.

    "},{"location":"usage/managing-tables/","title":"Managing Delta Tables","text":""},{"location":"usage/managing-tables/#vacuuming-tables","title":"Vacuuming tables","text":"

    Vacuuming a table will delete any files that have been marked for deletion. This may make some past versions of a table invalid, so this can break time travel. However, it will save storage space. Vacuum will retain files in a certain window, by default one week, so time travel will still work in shorter ranges.

    Delta tables usually don't delete old files automatically, so vacuuming regularly is considered good practice, unless the table is only appended to.

    Use DeltaTable.vacuum to perform the vacuum operation. Note that to prevent accidental deletion, the function performs a dry-run by default: it will only list the files to be deleted. Pass dry_run=False to actually delete files.

    >>> dt = DeltaTable(\"../rust/tests/data/simple_table\")\n>>> dt.vacuum()\n['../rust/tests/data/simple_table/part-00006-46f2ff20-eb5d-4dda-8498-7bfb2940713b-c000.snappy.parquet',\n '../rust/tests/data/simple_table/part-00190-8ac0ae67-fb1d-461d-a3d3-8dc112766ff5-c000.snappy.parquet',\n '../rust/tests/data/simple_table/part-00164-bf40481c-4afd-4c02-befa-90f056c2d77a-c000.snappy.parquet',\n ...]\n>>> dt.vacuum(dry_run=False) # Don't run this unless you are sure!\n
    "},{"location":"usage/managing-tables/#optimizing-tables","title":"Optimizing tables","text":"

    Small files in a table can be compacted with the optimize command. See the Delta Lake small file compaction with optimize guide for a detailed walkthrough.
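
    Here's a minimal sketch of compacting a table (this assumes the compact method on the optimize accessor available in recent deltalake versions):

    from deltalake import DeltaTable\n\ndt = DeltaTable(\"tmp/some-table\")\ndt.optimize.compact()\n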

    "},{"location":"usage/overview/","title":"Usage","text":"

    This guide teaches you how to use Delta Lake. You will learn how to create Delta tables, run queries, perform DML operations, and optimize your tables.

    It's easy to use Delta Lake with pandas, Polars, Rust, and many other PyArrow-compatible DataFrame libraries and engines.

    See the Spark Delta Lake documentation if you're using Delta Lake with Spark.

    "},{"location":"usage/querying-delta-tables/","title":"Querying Delta Tables","text":"

    Delta tables can be queried in several ways. By loading as Arrow data or an Arrow dataset, they can be used by compatible engines such as Pandas and DuckDB. By passing on the list of files, they can be loaded into other engines such as Dask.

    Delta tables are often larger than can fit into memory on a single computer, so this module provides ways to read only the parts of the data you need. Partition filters allow you to skip reading files that are part of irrelevant partitions. Only loading the columns required also saves memory. Finally, some methods allow reading tables batch-by-batch, allowing you to process the whole table while only having a portion loaded at any given time.

    To load into Pandas or a PyArrow table use the DeltaTable.to_pandas and DeltaTable.to_pyarrow_table methods, respectively. Both of these support filtering partitions and selecting particular columns.

    >>> from deltalake import DeltaTable\n>>> dt = DeltaTable(\"../rust/tests/data/delta-0.8.0-partitioned\")\n>>> dt.schema().to_pyarrow()\nvalue: string\nyear: string\nmonth: string\nday: string\n>>> dt.to_pandas(partitions=[(\"year\", \"=\", \"2021\")], columns=[\"value\"])\n      value\n0     6\n1     7\n2     5\n3     4\n>>> dt.to_pyarrow_table(partitions=[(\"year\", \"=\", \"2021\")], columns=[\"value\"])\npyarrow.Table\nvalue: string\n

    Converting to a PyArrow Dataset allows you to filter on columns other than partition columns and load the result as a stream of batches rather than a single table. Convert to a dataset using DeltaTable.to_pyarrow_dataset. Filters applied to datasets will use the partition values and file statistics from the Delta transaction log and push down any other filters to the scanning operation.

    >>> import pyarrow.dataset as ds\n>>> dataset = dt.to_pyarrow_dataset()\n>>> condition = (ds.field(\"year\") == \"2021\") & (ds.field(\"value\") > \"4\")\n>>> dataset.to_table(filter=condition, columns=[\"value\"]).to_pandas()\n  value\n0     6\n1     7\n2     5\n>>> batch_iter = dataset.to_batches(filter=condition, columns=[\"value\"], batch_size=2)\n>>> for batch in batch_iter: print(batch.to_pandas())\n  value\n0     6\n1     7\n  value\n0     5\n

    PyArrow datasets may also be passed to compatible query engines, such as DuckDB

    >>> import duckdb\n>>> ex_data = duckdb.arrow(dataset)\n>>> ex_data.filter(\"year = 2021 and value > 4\").project(\"value\")\n---------------------\n-- Expression Tree --\n---------------------\nProjection [value]\n  Filter [year=2021 AND value>4]\n    arrow_scan(140409099470144, 4828104688, 1000000)\n\n---------------------\n-- Result Columns  --\n---------------------\n- value (VARCHAR)\n\n---------------------\n-- Result Preview  --\n---------------------\nvalue\nVARCHAR\n[ Rows: 3]\n6\n7\n5\n

    Finally, you can always pass the list of file paths to an engine. For example, you can pass them to dask.dataframe.read_parquet:

    >>> import dask.dataframe as dd\n>>> df = dd.read_parquet(dt.file_uris())\n>>> df\nDask DataFrame Structure:\n                value             year            month              day\nnpartitions=6\n               object  category[known]  category[known]  category[known]\n                  ...              ...              ...              ...\n...               ...              ...              ...              ...\n                  ...              ...              ...              ...\n                  ...              ...              ...              ...\nDask Name: read-parquet, 6 tasks\n>>> df.compute()\n  value  year month day\n0     1  2020     1   1\n0     2  2020     2   3\n0     3  2020     2   5\n0     4  2021     4   5\n0     5  2021    12   4\n0     6  2021    12  20\n1     7  2021    12  20\n
    "},{"location":"usage/read-cdf/","title":"Reading the Change Data Feed from a Delta Table","text":"

    Reading the CDF data from a table with change data is easy.

    "},{"location":"usage/read-cdf/#reading-cdf-log","title":"Reading CDF Log","text":"Python Rust
    import polars\nfrom deltalake import DeltaTable\n\ndt = DeltaTable(\"../rust/tests/data/cdf-table\")\ntable = dt.load_cdf(starting_version=0, ending_version=4).read_all()\npt = polars.from_arrow(table)\npt.group_by(\"_commit_version\").len().sort(\"len\", descending=True)\n
    #[tokio::main]\nasync fn main() -> Result<(), Box<dyn std::error::Error>> {\n\n    let table = deltalake::open_table(\"../rust/tests/data/cdf-table\").await?;\n    let ops = DeltaOps(table);\n    let cdf = ops.load_cdf()\n        .with_starting_version(0)\n        .with_ending_version(4)\n        .build()\n        .await?;\n\n    arrow_cast::pretty::print_batches(&cdf)?;\n\n    Ok(())\n}\n

    The output can then be used in various execution engines. The Python example shows how one might consume the CDF feed inside Polars.

    "},{"location":"usage/optimize/delta-lake-z-order/","title":"Delta Lake Z Order","text":"

    This section explains how to Z Order a Delta table.

    Z Ordering colocates similar data in the same files, which allows for better file skipping and faster queries.

    Suppose you have a table with first_name, age, and country columns.

    If you Z Order the data by the country column, then individuals from the same country will be stored in the same files. When you subsequently query the data for individuals from a given country, it will execute faster because more data can be skipped.

    Here's how to Z Order a Delta table:

    dt = DeltaTable(\"tmp\")\ndt.optimize.z_order([country])\n
    "},{"location":"usage/optimize/small-file-compaction-with-optimize/","title":"Delta Lake small file compaction with optimize","text":"

    This post shows you how to perform small file compaction using the optimize method. This was added to the DeltaTable class in version 0.9.0. This command rearranges the small files into larger files, which reduces the number of files and speeds up queries.

    This is very helpful for workloads that append frequently. For example, if you have a table that is appended to every 10 minutes, after a year you will have 52,560 files in the table. If the table is partitioned by another dimension, you will have 52,560 files per partition; with just 100 unique values, that's over 5 million files. By running optimize periodically, you can reduce the number of files in the table to a more manageable number.

    Typically, you will run optimize less frequently than you append data. If possible, you might run optimize once you know you have finished writing to a particular partition. For example, on a table partitioned by date, you might append data every 10 minutes, but only run optimize once a day at the end of the day. This will ensure you don't need to compact the same data twice.

    This section will also teach you how to use vacuum to physically remove files from storage that are no longer needed. You\u2019ll often want to run vacuum after running optimize to remove the small files from storage once they\u2019ve been compacted into larger files.

    Let\u2019s start with an example to explain these key concepts. All the code covered in this post is stored in this notebook in case you\u2019d like to follow along.

    "},{"location":"usage/optimize/small-file-compaction-with-optimize/#create-a-delta-table-with-small-files","title":"Create a Delta table with small files","text":"

    Let\u2019s start by creating a Delta table with a lot of small files so we can demonstrate the usefulness of the optimize command.

    Start by writing a function that generates one thousand rows of random data given a timestamp.

    from datetime import datetime, timedelta\nimport itertools\n\nimport pyarrow as pa\nimport pyarrow.compute as pc\nfrom deltalake import DeltaTable, write_deltalake\n\n\ndef record_observations(date: datetime) -> pa.Table:\n    \"\"\"Pulls data for a certain datetime\"\"\"\n    nrows = 1000\n    return pa.table(\n        {\n            \"date\": pa.array([date.date()] * nrows),\n            \"timestamp\": pa.array([date] * nrows),\n            \"value\": pc.random(nrows),\n        }\n    )\n

    Let\u2019s run this function and observe the output:

    record_observations(datetime(2021, 1, 1, 12)).to_pandas()\n\n    date                timestamp   value\n0   2021-01-01  2021-01-01 12:00:00 0.3186397383362023\n1   2021-01-01  2021-01-01 12:00:00 0.04253766974259088\n2   2021-01-01  2021-01-01 12:00:00 0.9355682965171573\n\u2026\n999 2021-01-01  2021-01-01 12:00:00 0.23207037062879843\n

    Let\u2019s write 100 hours worth of data to the Delta table.

    # Every hour starting at midnight on 2021-01-01\nhours_iter = (datetime(2021, 1, 1) + timedelta(hours=i) for i in itertools.count())\n\n# Write 100 hours worth of data\nfor timestamp in itertools.islice(hours_iter, 100):\n    write_deltalake(\n        \"observation_data\",\n        record_observations(timestamp),\n        partition_by=[\"date\"],\n        mode=\"append\",\n    )\n

    This data was appended to the Delta table in 100 separate transactions, so the table will contain 100 transaction log entries and 100 data files. You can see the number of files with the files() method.

    dt = DeltaTable(\"observation_data\")\nlen(dt.files()) # 100\n

    Here\u2019s how the files are persisted in storage.

    observation_data\n\u251c\u2500\u2500 _delta_log\n\u2502   \u251c\u2500\u2500 00000000000000000000.json\n\u2502   \u251c\u2500\u2500 \u2026\n\u2502   \u2514\u2500\u2500 00000000000000000099.json\n\u251c\u2500\u2500 date=2021-01-01\n\u2502   \u251c\u2500\u2500 0-cfe227c6-edd9-4369-a1b0-db4559a2e693-0.parquet\n\u2502   \u251c\u2500\u2500 \u2026\n\u2502   \u251c\u2500\u2500 23-a4ace29e-e73e-40a1-81d3-0f5dc13093de-0.parquet\n\u251c\u2500\u2500 date=2021-01-02\n\u2502   \u251c\u2500\u2500 24-9698b456-66eb-4075-8732-fe56d81edb60-0.parquet\n\u2502   \u251c\u2500\u2500 \u2026\n\u2502   \u2514\u2500\u2500 47-d3fce527-e018-4c02-8acd-a649f6f523d2-0.parquet\n\u251c\u2500\u2500 date=2021-01-03\n\u2502   \u251c\u2500\u2500 48-fd90a7fa-5a14-42ed-9f59-9fe48d87899d-0.parquet\n\u2502   \u251c\u2500\u2500 \u2026\n\u2502   \u2514\u2500\u2500 71-5f143ade-8ae2-4854-bdc5-61154175665f-0.parquet\n\u251c\u2500\u2500 date=2021-01-04\n\u2502   \u251c\u2500\u2500 72-477c10fe-dc09-4087-80f0-56006e4a7911-0.parquet\n\u2502   \u251c\u2500\u2500 \u2026\n\u2502   \u2514\u2500\u2500 95-1c92cbce-8af4-4fe4-9c11-832245cf4d40-0.parquet\n\u2514\u2500\u2500 date=2021-01-05\n    \u251c\u2500\u2500 96-1b878ee5-25fd-431a-bc3e-6dcacc96b470-0.parquet\n    \u251c\u2500\u2500 \u2026\n    \u2514\u2500\u2500 99-9650ed63-c195-433d-a86b-9469088c14ba-0.parquet\n

    Each of these Parquet files is tiny - only about 10 KB. Let\u2019s see how to compact these tiny files into larger files, which is more efficient for data queries.

    "},{"location":"usage/optimize/small-file-compaction-with-optimize/#compact-small-files-in-the-delta-table-with-optimize","title":"Compact small files in the Delta table with optimize","text":"

    Let\u2019s run the optimize command to compact the existing small files into larger files:

    dt = DeltaTable(\"observation_data\")\n\ndt.optimize()\n

    Here\u2019s the output of the command:

    {'numFilesAdded': 5,\n 'numFilesRemoved': 100,\n 'filesAdded': {'min': 39000,\n  'max': 238282,\n  'avg': 198425.6,\n  'totalFiles': 5,\n  'totalSize': 992128},\n 'filesRemoved': {'min': 10244,\n  'max': 10244,\n  'avg': 10244.0,\n  'totalFiles': 100,\n  'totalSize': 1024400},\n 'partitionsOptimized': 5,\n 'numBatches': 1,\n 'totalConsideredFiles': 100,\n 'totalFilesSkipped': 0,\n 'preserveInsertionOrder': True}\n

    The optimize operation has added 5 new files and marked 100 existing files for removal (this is also known as \u201ctombstoning\u201d files). It has compacted the 100 tiny files into 5 larger files.

    Let\u2019s append some more data to the Delta table and see how we can selectively run optimize on the new data that\u2019s added.

    "},{"location":"usage/optimize/small-file-compaction-with-optimize/#handling-incremental-updates-with-optimize","title":"Handling incremental updates with optimize","text":"

    Let\u2019s append another 24 hours of data to the Delta table:

    for timestamp in itertools.islice(hours_iter, 24):\n    write_deltalake(\n        dt,\n        record_observations(timestamp),\n        partition_by=[\"date\"],\n        mode=\"append\",\n    )\n

    We can use get_add_actions() to introspect the table state. We can see that 2021-01-06 has only a few hours of data so far, so we don't want to optimize that yet. But 2021-01-05 has all 24 hours of data, so it's ready to be optimized.

    dt.get_add_actions(flatten=True).to_pandas()[\n    \"partition.date\"\n].value_counts().sort_index()\n\n2021-01-01     1\n2021-01-02     1\n2021-01-03     1\n2021-01-04     1\n2021-01-05    21\n2021-01-06     4\n

    To optimize a single partition, you can pass in a partition_filters argument specifying which partitions to optimize.

    dt.optimize(partition_filters=[(\"date\", \"=\", \"2021-01-05\")])\n\n{'numFilesAdded': 1,\n 'numFilesRemoved': 21,\n 'filesAdded': {'min': 238282,\n  'max': 238282,\n  'avg': 238282.0,\n  'totalFiles': 1,\n  'totalSize': 238282},\n 'filesRemoved': {'min': 10244,\n  'max': 39000,\n  'avg': 11613.333333333334,\n  'totalFiles': 21,\n  'totalSize': 243880},\n 'partitionsOptimized': 1,\n 'numBatches': 1,\n 'totalConsideredFiles': 21,\n 'totalFilesSkipped': 0,\n 'preserveInsertionOrder': True}\n

    This optimize operation tombstones 21 small data files and adds one file with all the existing data properly condensed. Let\u2019s take a look at a portion of the _delta_log/00000000000000000125.json file, which is the transaction log entry that corresponds to this incremental optimize command.

    {\n  \"remove\": {\n    \"path\": \"date=2021-01-05/part-00000-41178aab-2491-488f-943d-8f03867295ee-c000.snappy.parquet\",\n    \"deletionTimestamp\": 1683465499480,\n    \"dataChange\": false,\n    \"extendedFileMetadata\": null,\n    \"partitionValues\": {\n      \"date\": \"2021-01-05\"\n    },\n    \"size\": 39000,\n    \"tags\": null\n  }\n}\n\n{\n  \"remove\": {\n    \"path\": \"date=2021-01-05/101-79ae6fc9-c0cc-49ec-bb94-9aba879ac949-0.parquet\",\n    \"deletionTimestamp\": 1683465499481,\n    \"dataChange\": false,\n    \"extendedFileMetadata\": null,\n    \"partitionValues\": {\n      \"date\": \"2021-01-05\"\n    },\n    \"size\": 10244,\n    \"tags\": null\n  }\n}\n\n\u2026\n\n{\n  \"add\": {\n    \"path\": \"date=2021-01-05/part-00000-4b020a40-c836-4a11-851f-4691370c9f3a-c000.snappy.parquet\",\n    \"size\": 238282,\n    \"partitionValues\": {\n      \"date\": \"2021-01-05\"\n    },\n    \"modificationTime\": 1683465499493,\n    \"dataChange\": false,\n    \"stats\": \"{\\\"numRecords\\\":24000,\\\"minValues\\\":{\\\"value\\\":0.00005581532256615507,\\\"timestamp\\\":\\\"2021-01-05T00:00:00.000Z\\\"},\\\"maxValues\\\":{\\\"timestamp\\\":\\\"2021-01-05T23:00:00.000Z\\\",\\\"value\\\":0.9999911402868216},\\\"nullCount\\\":{\\\"timestamp\\\":0,\\\"value\\\":0}}\",\n    \"tags\": null\n  }\n}\n

    The transaction log indicates that many files have been tombstoned and one file is added, as expected.

    The Delta Lake optimize command \u201cremoves\u201d data by marking the data files as removed in the transaction log. The optimize command doesn\u2019t physically delete the Parquet file from storage. Optimize performs a \u201clogical remove\u201d not a \u201cphysical remove\u201d.

    Delta Lake uses logical operations so you can time travel back to earlier versions of your data. You can vacuum your Delta table to physically remove Parquet files from storage if you don\u2019t need to time travel and don\u2019t want to pay to store the tombstoned files.

    "},{"location":"usage/optimize/small-file-compaction-with-optimize/#vacuuming-after-optimizing","title":"Vacuuming after optimizing","text":"

    The vacuum command deletes all files from storage that are marked for removal in the transaction log and older than the retention period, which is 7 days by default.

    It\u2019s normally a good idea to have a retention period of at least 7 days. For purposes of this example, we will set the retention period to zero, just so you can see how the files get removed from storage. Adjusting the retention period in this manner isn\u2019t recommended for production use cases.

    Let\u2019s run the vacuum command:

    dt.vacuum(retention_hours=0, enforce_retention_duration=False, dry_run=False)\n

    The command returns a list of all the files that are removed from storage:

    ['date=2021-01-02/39-a98680f2-0e0e-4f26-a491-18b183f9eb05-0.parquet',\n 'date=2021-01-02/41-e96bc8bb-c571-484c-b534-e897424fb7da-0.parquet',\n \u2026\n 'date=2021-01-01/0-cfe227c6-edd9-4369-a1b0-db4559a2e693-0.parquet',\n 'date=2021-01-01/18-ded53418-172b-4e40-bf2e-7c8142e71bd1-0.parquet']\n

    Let\u2019s look at the content of the Delta table now that all the really small files have been removed from storage:

    observation_data\n\u251c\u2500\u2500 _delta_log\n\u2502   \u251c\u2500\u2500 00000000000000000000.json\n\u2502   \u251c\u2500\u2500 00000000000000000001.json\n\u2502   \u251c\u2500\u2500 \u2026\n\u2502   \u251c\u2500\u2500 00000000000000000124.json\n\u2502   \u2514\u2500\u2500 00000000000000000125.json\n\u251c\u2500\u2500 date=2021-01-01\n\u2502   \u2514\u2500\u2500 part-00000-31e3df5a-8bbe-425c-b85d-77794f922837-c000.snappy.parquet\n\u251c\u2500\u2500 date=2021-01-02\n\u2502   \u2514\u2500\u2500 part-00000-8af07878-b179-49ce-a900-d58595ffb60a-c000.snappy.parquet\n\u251c\u2500\u2500 date=2021-01-03\n\u2502   \u2514\u2500\u2500 part-00000-5e980864-b32f-4686-a58d-a75fae455c1e-c000.snappy.parquet\n\u251c\u2500\u2500 date=2021-01-04\n\u2502   \u2514\u2500\u2500 part-00000-1e82d23b-084d-47e3-9790-d68289c39837-c000.snappy.parquet\n\u251c\u2500\u2500 date=2021-01-05\n\u2502   \u2514\u2500\u2500 part-00000-4b020a40-c836-4a11-851f-4691370c9f3a-c000.snappy.parquet\n\u2514\u2500\u2500 date=2021-01-06\n    \u251c\u2500\u2500 121-0ecb5d70-4a28-4cd4-b2d2-89ee2285eaaa-0.parquet\n    \u251c\u2500\u2500 122-6b2d2758-9154-4392-b287-fe371ee507ec-0.parquet\n    \u251c\u2500\u2500 123-551d318f-4968-441f-83fc-89f98cd15daf-0.parquet\n    \u2514\u2500\u2500 124-287309d3-662e-449d-b4da-2e67b7cc0557-0.parquet\n

    All the partitions only contain a single file now, except for the date=2021-01-06 partition that has not been compacted yet.

    An entire partition won\u2019t necessarily get compacted to a single data file when optimize is run. Each partition has data files that are condensed to the target file size.

    "},{"location":"usage/optimize/small-file-compaction-with-optimize/#what-causes-the-small-file-problem","title":"What causes the small file problem?","text":"

    Delta tables can accumulate small files for a variety of reasons:

    • User error: users can accidentally write files that are too small. Users should sometimes repartition in memory before writing to disk to avoid appending files that are too small.
• Frequent appends: systems that append frequently tend to write many small files. A pipeline that appends every minute will generally generate about ten times as many small files as a system that appends every ten minutes.
    • Appending to partitioned data lakes with high cardinality columns can also cause small files. If you append every hour to a table that\u2019s partitioned on a column with 1,000 distinct values, then every append could create 1,000 new files. Partitioning by date avoids this problem because the data isn\u2019t split up across partitions in this manner.
    "},{"location":"usage/optimize/small-file-compaction-with-optimize/#conclusion","title":"Conclusion","text":"

    This page showed you how to create a Delta table with many small files, compact the small files into larger files with optimize, and remove the tombstoned files from storage with vacuum.

    You also learned about how to incrementally optimize partitioned Delta tables, so you only compact newly added data.

    An excessive number of small files slows down Delta table queries, so periodic compaction is important. Make sure to properly maintain your Delta tables, so performance does not degrade over time.

    "},{"location":"usage/writing/","title":"Writing Delta Tables","text":"

    For overwrites and appends, use write_deltalake. If the table does not already exist, it will be created. The data parameter will accept a Pandas DataFrame, a PyArrow Table, or an iterator of PyArrow Record Batches.

>>> import pandas as pd\n>>> from deltalake import write_deltalake\n>>> df = pd.DataFrame({'x': [1, 2, 3]})\n>>> write_deltalake('path/to/table', df)\n

Note: write_deltalake accepts a Pandas DataFrame, but will convert it to an Arrow table before writing. See caveats in pyarrow:python/pandas.

By default, writes create a new table and error if it already exists. This is controlled by the mode parameter, which mirrors the behavior of Spark's pyspark.sql.DataFrameWriter.saveAsTable DataFrame method. To overwrite, pass in mode='overwrite'; to append, pass in mode='append':

    >>> write_deltalake('path/to/table', df, mode='overwrite')\n>>> write_deltalake('path/to/table', df, mode='append')\n

write_deltalake will raise ValueError if the schema of the data passed to it differs from the existing table's schema. If you wish to alter the schema as part of an overwrite, pass in schema_mode=\"overwrite\" or schema_mode=\"merge\". schema_mode=\"overwrite\" will completely overwrite the schema, even if columns are dropped; merge will append the new columns and fill missing columns with null. schema_mode=\"merge\" is also supported on append operations.
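
Here is a minimal sketch of evolving the schema during an overwrite (the new column z, the table path, and the use of engine='rust' for schema evolution are illustrative assumptions):

>>> import pandas as pd\n>>> from deltalake import write_deltalake\n>>> df_new = pd.DataFrame({'x': [4, 5], 'z': ['a', 'b']})  # hypothetical data with a new column 'z'\n>>> write_deltalake('path/to/table', df_new, mode='overwrite', schema_mode='merge', engine='rust')  # existing columns kept, 'z' added and backfilled with null\n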

    "},{"location":"usage/writing/#overwriting-a-partition","title":"Overwriting a partition","text":"

    You can overwrite a specific partition by using mode=\"overwrite\" together with partition_filters. This will remove all files within the matching partition and insert your data as new files. This can only be done on one partition at a time. All of the input data must belong to that partition or else the method will raise an error.

>>> import pandas as pd\n>>> from deltalake import DeltaTable, write_deltalake\n>>> df = pd.DataFrame({'x': [1, 2, 3], 'y': ['a', 'a', 'b']})\n>>> write_deltalake('path/to/table', df, partition_by=['y'])\n\n>>> table = DeltaTable('path/to/table')\n>>> df2 = pd.DataFrame({'x': [100], 'y': ['b']})\n>>> write_deltalake(table, df2, partition_filters=[('y', '=', 'b')], mode=\"overwrite\")\n\n>>> table.to_pandas()\n     x  y\n0    1  a\n1    2  a\n2  100  b\n

    This method could also be used to insert a new partition if one doesn't already exist, making this operation idempotent.

    "},{"location":"usage/writing/#overwriting-part-of-the-table-data-using-a-predicate","title":"Overwriting part of the table data using a predicate","text":"

    Note

    This predicate is often called a replaceWhere predicate

    When you don\u2019t specify the predicate, the overwrite save mode will replace the entire table. Instead of replacing the entire table (which is costly!), you may want to overwrite only the specific parts of the table that should be changed. In this case, you can use a predicate to overwrite only the relevant records or partitions.

    Note

Data written must conform to the predicate, i.e. it must not contain any records that don't match the predicate condition; otherwise, the operation will fail.

    Python Rust

    replaceWhere

    import pyarrow as pa\nfrom deltalake import write_deltalake\n\n# Assuming there is already a table in this location with some records where `id = '1'` which we want to overwrite\ntable_path = \"/tmp/my_table\"\ndata = pa.table(\n    {\n        \"id\": pa.array([\"1\", \"1\"], pa.string()),\n        \"value\": pa.array([11, 12], pa.int64()),\n    }\n)\nwrite_deltalake(\n    table_path,\n    data,\n    mode=\"overwrite\",\n    predicate=\"id = '1'\",\n    engine=\"rust\",\n)\n

    replaceWhere

// Assuming there is already a table in this location with some records where `id = '1'` which we want to overwrite\nuse std::sync::Arc;\n\nuse arrow_array::RecordBatch;\nuse arrow_schema::{DataType, Field, Schema as ArrowSchema};\nuse deltalake::datafusion::logical_expr::{col, lit};\nuse deltalake::protocol::SaveMode;\nuse deltalake::DeltaOps;\n\nlet schema = ArrowSchema::new(vec![\n    Field::new(\"id\", DataType::Utf8, true),\n    Field::new(\"value\", DataType::Int32, true),\n]);\n\nlet data = RecordBatch::try_new(\n    schema.into(),\n    vec![\n        Arc::new(arrow::array::StringArray::from(vec![\"1\", \"1\"])),\n        Arc::new(arrow::array::Int32Array::from(vec![11, 12])),\n    ],\n)\n.unwrap();\n\nlet table = deltalake::open_table(\"/tmp/my_table\").await.unwrap();\nlet _table = DeltaOps(table)\n    .write(vec![data])\n    .with_save_mode(SaveMode::Overwrite)\n    .with_replace_where(col(\"id\").eq(lit(\"1\")))\n    .await\n    .unwrap();\n

    "},{"location":"usage/writing/writing-to-s3-with-locking-provider/","title":"Writing to S3 with a locking provider","text":"

    A locking mechanism is needed to prevent unsafe concurrent writes to a delta lake directory when writing to S3.

    "},{"location":"usage/writing/writing-to-s3-with-locking-provider/#dynamodb","title":"DynamoDB","text":"

DynamoDB is the only locking provider available in delta-rs at the moment. To enable DynamoDB as the locking provider, you need to set AWS_S3_LOCKING_PROVIDER to 'dynamodb', either in storage_options or as an environment variable.

    Additionally, you must create a DynamoDB table with the name delta_log so that it can be automatically recognized by delta-rs. Alternatively, you can use a table name of your choice, but you must set the DELTA_DYNAMO_TABLE_NAME variable to match your chosen table name. The required schema for the DynamoDB table is as follows:

    \"Table\": {\n    \"AttributeDefinitions\": [\n        {\n            \"AttributeName\": \"fileName\",\n            \"AttributeType\": \"S\"\n        },\n        {\n            \"AttributeName\": \"tablePath\",\n            \"AttributeType\": \"S\"\n        }\n    ],\n    \"TableName\": \"delta_log\",\n    \"KeySchema\": [\n        {\n            \"AttributeName\": \"tablePath\",\n            \"KeyType\": \"HASH\"\n        },\n        {\n            \"AttributeName\": \"fileName\",\n            \"KeyType\": \"RANGE\"\n        }\n    ],\n}\n

    Here is an example writing to s3 using this mechanism:

import pandas as pd\nfrom deltalake import write_deltalake\n\ndf = pd.DataFrame({'x': [1, 2, 3]})\nstorage_options = {'AWS_S3_LOCKING_PROVIDER': 'dynamodb', 'DELTA_DYNAMO_TABLE_NAME': 'custom_table_name'}\nwrite_deltalake('s3a://path/to/table', df, storage_options=storage_options)\n

    This locking mechanism is compatible with the one used by Apache Spark. The tablePath property, denoting the root url of the delta table itself, is part of the primary key, and all writers intending to write to the same table must match this property precisely. In Spark, S3 URLs are prefixed with s3a://, and a table in delta-rs must be configured accordingly.

The following command creates the necessary table from the AWS CLI:

    aws dynamodb create-table \\\n--table-name delta_log \\\n--attribute-definitions AttributeName=tablePath,AttributeType=S AttributeName=fileName,AttributeType=S \\\n--key-schema AttributeName=tablePath,KeyType=HASH AttributeName=fileName,KeyType=RANGE \\\n--provisioned-throughput ReadCapacityUnits=5,WriteCapacityUnits=5\n

You can find additional information in the delta-rs documentation, which also includes recommendations on configuring a time-to-live (TTL) for the table to avoid growing the table indefinitely.

    "},{"location":"usage/writing/writing-to-s3-with-locking-provider/#enable-unsafe-writes-in-s3-opt-in","title":"Enable unsafe writes in S3 (opt-in)","text":"

If for some reason you don't want to use DynamoDB as your locking mechanism, you can set the AWS_S3_ALLOW_UNSAFE_RENAME variable to true to enable unsafe S3 writes.
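
For example, a minimal sketch that opts into unsafe renames via storage_options (the table URI is hypothetical; the variable can also be set as an environment variable):

>>> import pandas as pd\n>>> from deltalake import write_deltalake\n>>> df = pd.DataFrame({'x': [1, 2, 3]})\n>>> storage_options = {'AWS_S3_ALLOW_UNSAFE_RENAME': 'true'}\n>>> write_deltalake('s3a://path/to/table', df, mode='append', storage_options=storage_options)  # no DynamoDB lock; only safe with a single writer\n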

    "},{"location":"usage/writing/writing-to-s3-with-locking-provider/#required-permissions","title":"Required permissions","text":"

You need permissions to get, put, and delete objects in the S3 bucket you're storing your data in. Please note that you must be allowed to delete objects even if you're only appending to the Delta table, because temporary files written to the log folder are deleted after use.

In AWS, these are the required S3 permissions:

• s3:GetObject
• s3:PutObject
• s3:DeleteObject

In DynamoDB, you need these permissions:

• dynamodb:GetItem
• dynamodb:Query
• dynamodb:PutItem
• dynamodb:UpdateItem

    "}]} \ No newline at end of file +{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"The deltalake package","text":"

    This is the documentation for the native Rust/Python implementation of Delta Lake. It is based on the delta-rs Rust library and requires no Spark or JVM dependencies. For the PySpark implementation, see delta-spark instead.

This module provides the capability to read, write, and manage Delta Lake tables with Python or Rust without Spark or Java. It uses Apache Arrow under the hood, so it is compatible with other Arrow-native or integrated libraries such as pandas, DuckDB, and Polars.

    "},{"location":"#important-terminology","title":"Important terminology","text":"
    • \"Rust deltalake\" refers to the Rust API of delta-rs (no Spark dependency)
    • \"Python deltalake\" refers to the Python API of delta-rs (no Spark dependency)
    • \"Delta Spark\" refers to the Scala implementation of the Delta Lake transaction log protocol. This depends on Spark and Java.
    "},{"location":"#why-implement-the-delta-lake-transaction-log-protocol-in-rust-and-scala","title":"Why implement the Delta Lake transaction log protocol in Rust and Scala?","text":"

Delta Spark depends on Java and Spark, which is fine for many use cases, but not all Delta Lake users want to depend on these libraries. delta-rs makes it possible to use Delta Lake from Rust and other native projects where depending on a JVM is not an option.

    Python deltalake lets you query Delta tables without depending on Java/Scala.

    Suppose you want to query a Delta table with pandas on your local machine. Python deltalake makes it easy to query the table with a simple pip install command - no need to install Java.
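
As an illustration, after pip install deltalake, a minimal sketch of reading a table into pandas (the table path is hypothetical):

>>> from deltalake import DeltaTable\n>>> dt = DeltaTable('path/to/table')\n>>> df = dt.to_pandas()  # query the Delta table with pandas, no Java or Spark required\n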

    "},{"location":"#contributing","title":"Contributing","text":"

The Delta Lake community welcomes contributions from all developers, regardless of experience or programming background.

    You can write Rust code, Python code, documentation, submit bugs, or give talks to the community. We welcome all of these contributions.

    Feel free to join our Slack and message us in the #delta-rs channel any time!

    We value kind communication and building a productive, friendly environment for maximum collaboration and fun.

    "},{"location":"#project-history","title":"Project history","text":"

    Check out this video by Denny Lee & QP Hou to learn about the genesis of the delta-rs project:

    "},{"location":"delta-lake-best-practices/","title":"Delta Lake Best Practices","text":"

    This page outlines Delta Lake best practices.

    You should consider several factors to optimize the performance of your Delta tables and minimize costs.

    The Delta Lake best practices depend on your data ingestion into the Delta table and query patterns. You must understand your data and how users run queries to best leverage Delta Lake.

    "},{"location":"delta-lake-best-practices/#compacting-small-files","title":"Compacting small files","text":"

    Delta tables work best when the files are \u201cright-sized\u201d. Files that are too small create I/O overhead. Files that are too large limit the impact of file skipping (a critical query optimization).

Delta tables can accumulate a lot of small files, especially if you\u2019re frequently writing small amounts of data. If your table has many small files, you should run a small file compaction operation to consolidate all the tiny files into \u201cright-sized\u201d files.

    It\u2019s generally best for files in a Delta table to be between 100MB and 1GB, but that can vary based on the overall size of the table and the query patterns.

    Delta Lake makes it easy to compact the small files.
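
For example, a minimal sketch of compacting small files with the Python API (the table path is hypothetical):

>>> from deltalake import DeltaTable\n>>> dt = DeltaTable('path/to/table')\n>>> dt.optimize.compact()  # rewrites many small files into larger, right-sized files\n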

    "},{"location":"delta-lake-best-practices/#optimizing-table-layout","title":"Optimizing table layout","text":"

    You can colocate similar data in the same files to make file skipping more effective. Delta Lake supports Z Ordering, which can colocate similar data in the same files.

    Z Ordering can yield impressive performance gains for low-cardinality columns but also works well for high-cardinality columns. This is an advantage compared to Hive-style partitioning, which is only suitable for low-cardinality columns.

    You must analyze the most common query patterns and Z Order your dataset based on the columns allowing the most file skipping. The ability to colocate data in the same files decreases when you add more Z Order columns.
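
For example, a minimal sketch of Z Ordering a table by a frequently filtered column (the table path and column name are hypothetical):

>>> from deltalake import DeltaTable\n>>> dt = DeltaTable('path/to/table')\n>>> dt.optimize.z_order(['user_id'])  # colocate similar user_id values in the same files to improve file skipping\n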

    Let\u2019s look at Hive-style partitioning, another way to colocate data in the same files. You can also use Hive-style partitioning in conjunction with Z Ordering.

    "},{"location":"delta-lake-best-practices/#partitioning-datasets","title":"Partitioning datasets","text":"

    You can partition your Delta tables, which separates the data by one or more partition keys into separate folders. Partitioning can be an excellent performance optimization (when you filter on the partition key) and is a good way to sidestep concurrency conflict issues.
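
Here is a minimal sketch of creating a Hive-style partitioned table (the table path, data, and partition column are hypothetical):

>>> import pandas as pd\n>>> from deltalake import write_deltalake\n>>> df = pd.DataFrame({'value': [1, 2], 'date': ['2021-01-01', '2021-01-02']})\n>>> write_deltalake('path/to/table', df, partition_by=['date'])  # one folder per distinct date value\n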

    Hive-style partitioning also has some significant downsides.

    • It\u2019s only suitable for low-cardinality columns.
    • It can create many small files, especially if you use the wrong partition key or frequently update the Delta table.
    • It can cause some queries that don\u2019t rely on the partition key to run slower (because of the excessive number of small files). A large number of small files is problematic for I/O throughput.

    Hive-style partitioning can be a great data management tactic and a fantastic option for many Delta tables. Beware of the downsides before partitioning your tables.

    You can use Hive-style partitioning in conjunction with Z Ordering. You can partition a table by one column and Z Order by another. They\u2019re different tactics that aim to help you skip more files and run queries faster.

    Let\u2019s look at some of the built-in Delta features that help maintain the integrity of your tables.

    "},{"location":"delta-lake-best-practices/#use-appropriate-quality-controls","title":"Use appropriate quality controls","text":"

    Delta Lake supports schema enforcement and column constraints to protect the integrity of your data.

Delta Lake enables schema enforcement by default, so you can only append data to an existing table with the same exact schema. You can bypass schema enforcement by enabling schema evolution, which allows you to append mismatched schemas to a table.

    You should only enable schema evolution when you want to allow the schema of your table to change. You should not enable schema evolution if you don\u2019t want this flexibility. Schema enforcement is a good default setting.

    Column-level constraints prevent you from appending data that fail SQL predicates. For example, you may add a constraint that requires all the values in the age column of a table to be positive.

    You should add column constraints to your table whenever you want a column only to include values that satisfy a predicate.
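
For example, a minimal sketch of adding a column constraint with the Python API (the table path, constraint name, and age column are hypothetical):

>>> from deltalake import DeltaTable\n>>> dt = DeltaTable('path/to/table')\n>>> dt.alter.add_constraint({'age_is_positive': 'age > 0'})  # future appends with non-positive ages will fail\n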

    No data is appended when you apply a constraint and a row check fails. For example, if you try to append 100 rows of data to a table and one row has a failing check, then no data is added.

    When you have column constraints, it\u2019s often a good idea to append the failing data to a \u201cquarantine table\u201d and the passing data to the main Delta table. Or you can filter out the failing rows and just append the passing rows. Keeping a history of the failing rows in a quarantine table is helpful for debugging.

    See here to learn more about Delta Lake constraints.

    "},{"location":"delta-lake-best-practices/#best-practices-for-dml-operations","title":"Best practices for DML operations","text":"

    DML operations like deleting, updating, and merging write existing data in new files and mark existing files for deletion in the transaction log. Rewriting data files is expensive, so you want to minimize the number of rewritten files when you run DML operations.

    Delta Lake supports a table feature called deletion vectors that implements DML transactions more efficiently under the hood. Enabling deletion vectors is usually the best way to make DML operations run faster. Note: delta-rs doesn\u2019t support deletion vectors yet.

    You should periodically purge deletion vectors because they can accumulate and slow subsequent read operations. Once you enable the feature, you must purge the deletion vectors in your table with an appropriate cadence.

    "},{"location":"delta-lake-best-practices/#use-vacuum-to-save-storage-costs","title":"Use vacuum to save storage costs","text":"

Delta Lake supports transactions, which necessitates keeping old versions of data in storage, even files that are marked for removal in the transaction log.

    Keeping old versions of Delta tables in storage is often desirable because it allows for versioned data, time travel, and rolling back tables to a previous state.

    If you don\u2019t want to leverage older versions of a table, then you should remove the legacy files from storage with the vacuum command. Vacuum will remove all files older than the table retention period and marked for removal in the transaction log.

    You only need to vacuum when you perform operations that mark files for removal in the transaction log. An append-only table doesn\u2019t create legacy files that need to be vacuumed.

    Create a good vacuum strategy for your tables to minimize your storage costs.

    "},{"location":"delta-lake-best-practices/#delta-lake-best-practices-to-minimize-costs","title":"Delta Lake best practices to minimize costs","text":"

    Delta Lake helps you minimize costs in many ways:

    • It's a free, open source format (based on Parquet). It's not a proprietary format that you need to pay for.
    • Delta tables store column-level min/max values in the transaction log, allowing file skipping.
    • Delta tables can be optimized (small file compaction, Z Ordering, etc.), so your queries run faster. When your queries run faster, then you pay less on compute.
    • Deletion vectors let you perform DML operations (delete, update, merge) much faster. If your delete operation runs 100x faster, then you pay 100x less compute.
    • It's easy to remove legacy files from storage with VACUUM, which minimizes storage costs.

    You should understand your organization\u2019s query patterns and use these features to minimize the overall cost. You need to assess tradeoffs. For example, Z Ordering is a computation that costs money, but it can save you lots of money in the long run if all your subsequent queries run a lot faster and use less compute.

    "},{"location":"delta-lake-best-practices/#collect-metadata-stats-on-columns-used-for-file-skipping","title":"Collect metadata stats on columns used for file skipping","text":"

    Delta tables don\u2019t always store each column's min/max values. Some Delta Lake implementations only store min/max values for the first 32 columns in the table, for example.

    Delta Lake can only apply file-skipping when it has min/max values for the relevant columns stored in the transaction log. Suppose you\u2019re running a filtering operation on col_a, for example. Delta Lake can only apply file skipping when the transaction log stores col_a min/max metadata.

    Ensure the transaction log stores metadata stats for all the columns that benefit from file skipping.
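
As a hedged sketch, one way to influence which columns get statistics is through table properties at write time; this assumes your deltalake version honors the delta.dataSkippingNumIndexedCols property (check the documentation for your release):

>>> import pandas as pd\n>>> from deltalake import write_deltalake\n>>> df = pd.DataFrame({'col_a': [1, 2], 'col_b': ['x', 'y']})\n>>> write_deltalake('path/to/table', df, engine='rust', configuration={'delta.dataSkippingNumIndexedCols': '5'})  # assumption: stats are collected for the first 5 columns only\n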

    "},{"location":"delta-lake-best-practices/#dont-collect-column-metadata-when-its-unnecessary","title":"Don\u2019t collect column metadata when it\u2019s unnecessary","text":"

    It takes some time to compute column statistics when writing files, and it isn\u2019t worth the effort if you cannot use the column for file skipping.

    Suppose you have a table column containing a long string of arbitrary text. It\u2019s unlikely that this column would ever provide any data-skipping benefits. So, you can just avoid the overhead of collecting the statistics for this particular column.

    "},{"location":"delta-lake-best-practices/#additional-reading","title":"Additional reading","text":"

    Delta Lake relies on transactions, and you should check out this page to learn more.

    Many Delta Lake performance benefits rely on file skipping, which you should understand well to get the most out of Delta.

    "},{"location":"delta-lake-best-practices/#conclusion","title":"Conclusion","text":"

    Delta Lake is a powerful technology that makes your data pipelines more reliable, saves money, and makes everyday data processing tasks easy.

    You need to learn how Delta Lake works at a high level to leverage Delta's power fully. You will not be able to leverage Delta Lake\u2019s full performance potential if your table has improperly sized files or if you\u2019re not colocating data in the same files to maximize data skipping, for example.

    Luckily, there are only a few details that are important to learn. You don\u2019t need to know the implementation details - just the essential high-level concepts.

    "},{"location":"delta-lake-big-data-small-data/","title":"Delta Lake for big data and small data","text":"

    Delta Lake is an excellent storage format for big data and small data.

This post explains why Delta Lake is suitable for massive datasets and why many of its features are great even for tiny tables. Delta Lake works well for tables with less than 1 GB of data and for tables with hundreds of petabytes of data.

    Let\u2019s start by discussing the features that are great for small data.

    "},{"location":"delta-lake-big-data-small-data/#delta-lake-for-small-data-tables","title":"Delta Lake for small data tables","text":"

    Delta Lake has many features that are useful for small datasets:

    • Reliable transactions
    • Better performance via file skipping
    • DML operations to make deletes, updates, and merges easy and performant
    • Features like schema enforcement and constraints to enforce data quality
    • Versioned data & time travel

    All of these features are great for large and small tables.

    Delta Lake DML operations are ACID transactions, so they either finish entirely or don\u2019t finish at all. Delta tables don\u2019t require any downtime while DML operations are running. The Delta Lake user experience is better than a data lake that doesn\u2019t support transactions and has downtime while running DML operations.

The Delta Lake API also makes it easy to run DML operations. You can delete rows from a Delta table with a single line of code. Writing code to delete rows from CSV files is more challenging, especially if you want to implement this operation efficiently.
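
For example, a minimal sketch of a delete (the table path and predicate are hypothetical):

>>> from deltalake import DeltaTable\n>>> dt = DeltaTable('path/to/table')\n>>> dt.delete('id = 5')  # removes matching rows in a single transaction\n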

Delta Lake has built-in checks to retain the integrity of your tables. For example, Delta tables have schema enforcement and prevent you from appending DataFrames whose schema doesn't match the existing table. Delta Lake also lets you add constraints that only allow appending specific values to a column. Data quality is also essential for small tables!

    Delta Lake splits data into multiple files with file-level metadata in the transaction log, so query engines can sometimes skip data. Data skipping can be a huge performance benefit, depending on how much data can be ignored by the query engine.

    As previously mentioned, Delta tables record all DML operations as transactions. Recording operations as transactions means that existing data isn\u2019t mutated. So Delta Lake provides versioned data and time travel out of the box. Versioning data is better because it allows you to roll back mistakes and compare the state of the table at different points in time.

    Delta Lake has many useful features for small data tables. Let\u2019s look at how Delta Lake is scalable for massive datasets.

    "},{"location":"delta-lake-big-data-small-data/#delta-lake-for-large-data-tables","title":"Delta Lake for large data tables","text":"

    Delta Lake is designed to be scalable and can handle tables with terabytes or petabytes of data.

    See here for an example of an organization ingesting 220 TB of data into a Delta table daily.

    Delta tables store data in Parquet files, and cloud object stores allow engines to write any number of files. Delta tables store metadata information in the transaction log as JSON files, which are periodically compacted into Parquet files, so an arbitrarily large amount of Delta table metadata can also be stored.

    Delta Lake transactions and concurrency protection maintain the integrity of tables, even for large write operations or long-running computations.

    It\u2019s well known that Delta tables are scalable, even for the most enormous tables.

    "},{"location":"delta-lake-big-data-small-data/#small-data-operations-on-large-tables","title":"Small data operations on large tables","text":"

    Delta Lake is flexible and allows you to use \u201csmall data engines,\u201d even for large tables, depending on the computation.

    Suppose you have a Delta table containing 10 TB of data and a pipeline that appends 0.5 GB of data to the table every hour. You don\u2019t need a big data query engine to append a small amount of data. You can set up this job to run the Delta table append with a small data engine like pandas or Polars.
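
For example, a minimal sketch of such an hourly append using Polars (the table path and data are hypothetical):

import polars as pl\n\ndf = pl.DataFrame({'value': [1.0, 2.0]})  # the small hourly batch\ndf.write_delta('path/to/table', mode='append')  # lightweight append, no big data engine needed\n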

    Delta tables are flexible and interoperable with many technologies so that you can use the right tool for each data processing job. This allows you to design pipelines how you\u2019d like and minimize costs.

    "},{"location":"delta-lake-big-data-small-data/#when-delta-lake-isnt-needed","title":"When Delta Lake isn\u2019t needed","text":"

    You don\u2019t need Delta Lake for a small dataset that never changes and can be stored in a single Parquet file.

    Suppose you have a 0.5 GB dataset in a Parquet file that never needs to be updated. You can just keep that data in a Parquet table. Reading the metadata from the Parquet footer of a single file isn\u2019t expensive. You won\u2019t be taking advantage of Delta Lake's features like transactions, convenient DML operations, or versioned data.

    But in most cases, it\u2019s best to use Delta Lake because its features protect the integrity of your tables and make your life easier.

    "},{"location":"delta-lake-big-data-small-data/#conclusion","title":"Conclusion","text":"

    Delta Lake is well known for being scalable to huge tables but is also an excellent technology for small tables.

    Delta Lake is a lightweight technology, so there is little overhead. Writing the metadata file after performing a transaction is fast. It\u2019s a minuscule cost, considering the benefits you receive.

    Many reasons that make Delta Lake better than data lakes for large tables also apply to small tables!

    "},{"location":"why-use-delta-lake/","title":"Why use Delta Lake","text":"

    This page explains why Delta Lake is a better storage format for most tabular data analyses than data lake alternatives.

    Delta Lake provides developer-friendly features, reliable transactions, and fast performance compared with alternatives like Parquet or CSV.

    "},{"location":"why-use-delta-lake/#fast-performance","title":"Fast performance","text":"

    Delta tables store data in Parquet files and persist file-level metadata in the transaction log.

    This offers two main performance advantages:

    • File skipping based on metadata that\u2019s quickly accessible
    • Easy identification of all file paths for the table, compared to file listing operations that can be slow, especially on cloud object stores

    Delta Lake stores min/max values for each column of each file in the table. Certain queries can skip entire files based on the metadata. File skipping can be a massive performance optimization.

    Delta Lake also makes it easy to rearrange data in the table, so more file skipping is possible. For example, the table can be partitioned or Z Ordered, so that similar data is colocated in the same files and data skipping is optimal for your query patterns.

    For data lakes, you need to run file listing operations to get the file paths before you can actually read the data. Listing all the files in a data lake can take a long time, especially if there are a lot of files and they are stored in Hive-style partitions.

    Delta Lake stores all the file paths in the transaction log. So you can quickly get the file paths directly from the log and then run your query. Delta Lake also stores the file-level metadata in the transaction log which is quicker than opening all the files in the data lake and grabbing the metadata from the file footer.
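
For example, a minimal sketch of listing the file paths straight from the transaction log (the table path is hypothetical):

>>> from deltalake import DeltaTable\n>>> dt = DeltaTable('path/to/table')\n>>> dt.file_uris()  # file paths come from the transaction log, no object store listing needed\n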

    "},{"location":"why-use-delta-lake/#developer-friendly-features","title":"Developer friendly features","text":"

    Many basic data operations are hard in data lakes but quite easy with Delta Lake. The only data operation that\u2019s easy with a data lake is appending data. Delta Lake makes all data operations easy including the following:

    • Appends
    • Upserts
    • Deletes
    • Replace where

    Even deleting a few rows of data from a data lake is hard. It\u2019s even harder if you want to run the operation in a performant manner.

    Delta Lake makes it easy to run common data operations and executes them performantly under the hood.
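
For example, a hedged sketch of an upsert with the Python merge API (the table path, source data, and join predicate are hypothetical):

>>> import pyarrow as pa\n>>> from deltalake import DeltaTable\n>>> new_data = pa.table({'id': [1, 2], 'value': [10, 20]})  # hypothetical source data\n>>> dt = DeltaTable('path/to/table')\n>>> (\n...     dt.merge(source=new_data, predicate='target.id = source.id', source_alias='source', target_alias='target')\n...     .when_matched_update_all()\n...     .when_not_matched_insert_all()\n...     .execute()\n... )\n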

Delta Lake also executes write operations as transactions, which makes data operations safer and prevents downtime. Write operations leave data lakes in an unstable state while the computation is running. For example, if you read a data lake while a delete operation is running, then you may get the wrong data.

    Let\u2019s explore the benefits of reliable transactions in more detail.

    "},{"location":"why-use-delta-lake/#reliable-transactions","title":"Reliable transactions","text":"

    Delta Lake supports transactions which means that write operations have the following characteristics:

    • They either finish completely or don\u2019t run at all
    • They are executed in a serial manner and don\u2019t conflict with other transactions
    • They don\u2019t corrupt a table or violate table constraints

    Data lakes don\u2019t support transactions, so the write operations can cause the following errors:

    • There is no schema enforcement, so you can append data to a data lake with a mismatching schema
    • Reading the data lake often yields incorrect results while write transactions are performed
    • Data lakes can be corrupted for invalid write operations or computations that error-out
    • Concurrent transactions that conflict can cause data loss

    Production data systems should rely on storage systems like Delta Lake that support transactions.

    "},{"location":"why-use-delta-lake/#interoperability","title":"Interoperability","text":"

    Delta Lake tables are interoperable and can be read/written by multiple different query engines.

    For example, you can create a Delta table with Spark, append to it with pandas, and then read it with Polars.
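
Concretely, a minimal sketch of appending with pandas and reading the same table with Polars (the table path is hypothetical):

>>> import pandas as pd\n>>> import polars as pl\n>>> from deltalake import write_deltalake\n>>> write_deltalake('path/to/table', pd.DataFrame({'x': [1]}), mode='append')\n>>> pl.read_delta('path/to/table')  # the same table, read with a different engine\n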

    Delta tables are powerful because they are interoperable with various query engines and computation runtimes.

    Suppose you have a Delta table that\u2019s updated with an AWS Lambda function every 5 minutes. There is only a small amount of data collected every 5 minutes, so a lightweight runtime like AWS Lambda is sufficient.

    Further suppose that the overall table is quite large. So when you want to perform DML operations or query the whole table, your team uses a Spark cluster.

Delta Lake is flexible enough to allow these types of operations from multiple readers and writers, giving teams the flexibility to choose the right tool for each job.

    "},{"location":"why-use-delta-lake/#support-for-many-languages","title":"Support for many languages","text":"

Delta tables can be queried with a variety of different languages. This project provides APIs for Rust and Python users and does not depend on Java or Scala. This project is a great option for pandas, Polars, DuckDB, or DataFusion users.

    Delta Lake supports many languages and even more language support is coming soon!

    "},{"location":"why-use-delta-lake/#support-on-multiple-clouds","title":"Support on multiple clouds","text":"

    Delta Lake supports multiple clouds including GCP, AWS, and Azure.

    You can also use Delta Lake on your local machine or in an on-prem environment.

    Delta Lake is quite portable.

    "},{"location":"why-use-delta-lake/#conclusion","title":"Conclusion","text":"

    Delta Lake is a mature table format that offers users tons of advantages over a data lake with virtually no downsides.

    Once you start using Delta Lake, you will never want to go back to data lakes that expose you to a variety of dangerous bugs, poor performance, and reliability issues.

The Delta Lake community is also welcoming and open. We gladly accept new contributors and help users with their questions.

    "},{"location":"api/catalog/","title":"Catalog","text":"","boost":2},{"location":"api/catalog/#deltalake.data_catalog.DataCatalog","title":"deltalake.data_catalog.DataCatalog","text":"

    Bases: Enum

    List of the Data Catalogs

    ","boost":2},{"location":"api/catalog/#deltalake.data_catalog.DataCatalog.AWS","title":"AWS class-attribute instance-attribute","text":"
    AWS = 'glue'\n

Refers to the AWS Glue Data Catalog: https://docs.aws.amazon.com/glue/latest/dg/catalog-and-crawler.html

    ","boost":2},{"location":"api/catalog/#deltalake.data_catalog.DataCatalog.UNITY","title":"UNITY class-attribute instance-attribute","text":"
    UNITY = 'unity'\n

Refers to the Databricks Unity Catalog: https://docs.databricks.com/data-governance/unity-catalog/index.html

    ","boost":2},{"location":"api/delta_writer/","title":"Writer","text":"","boost":10},{"location":"api/delta_writer/#write-to-delta-tables","title":"Write to Delta Tables","text":"","boost":10},{"location":"api/delta_writer/#deltalake.write_deltalake","title":"deltalake.write_deltalake","text":"
    write_deltalake(table_or_uri: Union[str, Path, DeltaTable], data: Union[pd.DataFrame, ds.Dataset, pa.Table, pa.RecordBatch, Iterable[pa.RecordBatch], RecordBatchReader], *, schema: Optional[Union[pa.Schema, DeltaSchema]] = None, partition_by: Optional[Union[List[str], str]] = None, mode: Literal['error', 'append', 'overwrite', 'ignore'] = 'error', file_options: Optional[ds.ParquetFileWriteOptions] = None, max_partitions: Optional[int] = None, max_open_files: int = 1024, max_rows_per_file: int = 10 * 1024 * 1024, min_rows_per_group: int = 64 * 1024, max_rows_per_group: int = 128 * 1024, name: Optional[str] = None, description: Optional[str] = None, configuration: Optional[Mapping[str, Optional[str]]] = None, schema_mode: Optional[Literal['merge', 'overwrite']] = None, storage_options: Optional[Dict[str, str]] = None, partition_filters: Optional[List[Tuple[str, str, Any]]] = None, predicate: Optional[str] = None, large_dtypes: bool = False, engine: Literal['pyarrow', 'rust'] = 'pyarrow', writer_properties: Optional[WriterProperties] = None, custom_metadata: Optional[Dict[str, str]] = None) -> None\n

    Write to a Delta Lake table

    If the table does not already exist, it will be created.

The pyarrow writer currently supports protocol version 2 and won't be updated. For higher protocol support, use engine='rust'; this will become the default eventually.

    To enable safe concurrent writes when writing to S3, an additional locking mechanism must be supplied. For more information on enabling concurrent writing to S3, follow this guide

    Parameters:

    Name Type Description Default table_or_uri Union[str, Path, DeltaTable]

    URI of a table or a DeltaTable object.

    required data Union[DataFrame, Dataset, Table, RecordBatch, Iterable[RecordBatch], RecordBatchReader]

    Data to write. If passing iterable, the schema must also be given.

    required schema Optional[Union[Schema, Schema]]

    Optional schema to write.

    None partition_by Optional[Union[List[str], str]]

    List of columns to partition the table by. Only required when creating a new table.

    None mode Literal['error', 'append', 'overwrite', 'ignore']

    How to handle existing data. Default is to error if table already exists. If 'append', will add new data. If 'overwrite', will replace table with new data. If 'ignore', will not write anything if table already exists.

    'error' file_options Optional[ParquetFileWriteOptions]

    Optional write options for Parquet (ParquetFileWriteOptions). Can be provided with defaults using ParquetFileWriteOptions().make_write_options(). Please refer to https://github.com/apache/arrow/blob/master/python/pyarrow/_dataset_parquet.pyx#L492-L533 for the list of available options. Only used in pyarrow engine.

    None max_partitions Optional[int]

    the maximum number of partitions that will be used. Only used in pyarrow engine.

    None max_open_files int

    Limits the maximum number of files that can be left open while writing. If an attempt is made to open too many files then the least recently used file will be closed. If this setting is set too low you may end up fragmenting your data into many small files. Only used in pyarrow engine.

    1024 max_rows_per_file int

Maximum number of rows per file. If greater than 0 then this will limit how many rows are placed in any single file. Otherwise there will be no limit and one file will be created in each output directory unless files need to be closed to respect max_open_files. Only used in pyarrow engine.

10 * 1024 * 1024 min_rows_per_group int

Minimum number of rows per group. When the value is set, the dataset writer will batch incoming data and only write the row groups to the disk when sufficient rows have accumulated. Only used in pyarrow engine.

64 * 1024 max_rows_per_group int

    Maximum number of rows per group. If the value is set, then the dataset writer may split up large incoming batches into multiple row groups. If this value is set, then min_rows_per_group should also be set.

    128 * 1024 name Optional[str]

    User-provided identifier for this table.

    None description Optional[str]

    User-provided description for this table.

    None configuration Optional[Mapping[str, Optional[str]]]

    A map containing configuration options for the metadata action.

    None schema_mode Optional[Literal['merge', 'overwrite']]

    If set to \"overwrite\", allows replacing the schema of the table. Set to \"merge\" to merge with existing schema.

    None storage_options Optional[Dict[str, str]]

    options passed to the native delta filesystem.

    None predicate Optional[str]

    When using Overwrite mode, replace data that matches a predicate. Only used in rust engine.

    None partition_filters Optional[List[Tuple[str, str, Any]]]

    the partition filters that will be used for partition overwrite. Only used in pyarrow engine.

    None large_dtypes bool

If True, the data schema is kept in large_dtypes; this has no effect on pandas DataFrame input.

    False engine Literal['pyarrow', 'rust']

    writer engine to write the delta table. Rust engine is still experimental but you may see up to 4x performance improvements over pyarrow.

    'pyarrow' writer_properties Optional[WriterProperties]

    Pass writer properties to the Rust parquet writer.

    None custom_metadata Optional[Dict[str, str]]

    Custom metadata to add to the commitInfo.

    None","boost":10},{"location":"api/delta_writer/#deltalake.WriterProperties","title":"deltalake.WriterProperties dataclass","text":"
    WriterProperties(data_page_size_limit: Optional[int] = None, dictionary_page_size_limit: Optional[int] = None, data_page_row_count_limit: Optional[int] = None, write_batch_size: Optional[int] = None, max_row_group_size: Optional[int] = None, compression: Optional[Literal['UNCOMPRESSED', 'SNAPPY', 'GZIP', 'BROTLI', 'LZ4', 'ZSTD', 'LZ4_RAW']] = None, compression_level: Optional[int] = None)\n

    A Writer Properties instance for the Rust parquet writer.

    Create a Writer Properties instance for the Rust parquet writer:

    Parameters:

    Name Type Description Default data_page_size_limit Optional[int]

    Limit DataPage size to this in bytes.

    None dictionary_page_size_limit Optional[int]

    Limit the size of each DataPage to store dicts to this amount in bytes.

    None data_page_row_count_limit Optional[int]

    Limit the number of rows in each DataPage.

    None write_batch_size Optional[int]

    Splits internally to smaller batch size.

    None max_row_group_size Optional[int]

    Max number of rows in row group.

    None compression Optional[Literal['UNCOMPRESSED', 'SNAPPY', 'GZIP', 'BROTLI', 'LZ4', 'ZSTD', 'LZ4_RAW']]

    compression type.

    None compression_level Optional[int]

If None and the chosen compression has a level, the default level will be used. Only relevant for GZIP (levels 1-9), BROTLI (levels 1-11), and ZSTD (levels 1-22).

    None","boost":10},{"location":"api/delta_writer/#convert-to-delta-tables","title":"Convert to Delta Tables","text":"","boost":10},{"location":"api/delta_writer/#deltalake.convert_to_deltalake","title":"deltalake.convert_to_deltalake","text":"
    convert_to_deltalake(uri: Union[str, Path], mode: Literal['error', 'ignore'] = 'error', partition_by: Optional[pa.Schema] = None, partition_strategy: Optional[Literal['hive']] = None, name: Optional[str] = None, description: Optional[str] = None, configuration: Optional[Mapping[str, Optional[str]]] = None, storage_options: Optional[Dict[str, str]] = None, custom_metadata: Optional[Dict[str, str]] = None) -> None\n

    Convert parquet tables to delta tables.

Currently only HIVE partitioned tables are supported. Converting to Delta creates a transaction log commit with add actions and any additional properties provided, such as configuration, name, and description.

    Parameters:

    Name Type Description Default uri Union[str, Path]

    URI of a table.

    required partition_by Optional[Schema]

    Optional partitioning schema if table is partitioned.

    None partition_strategy Optional[Literal['hive']]

    Optional partition strategy to read and convert

    None mode Literal['error', 'ignore']

    How to handle existing data. Default is to error if table already exists. If 'ignore', will not convert anything if table already exists.

    'error' name Optional[str]

    User-provided identifier for this table.

    None description Optional[str]

    User-provided description for this table.

    None configuration Optional[Mapping[str, Optional[str]]]

    A map containing configuration options for the metadata action.

    None storage_options Optional[Dict[str, str]]

    options passed to the native delta filesystem. Unused if 'filesystem' is defined.

    None custom_metadata Optional[Dict[str, str]]

    custom metadata that will be added to the transaction commit

    None","boost":10},{"location":"api/exceptions/","title":"Exceptions","text":"","boost":2},{"location":"api/exceptions/#deltalake.exceptions.DeltaError","title":"deltalake.exceptions.DeltaError","text":"

    Bases: builtins.Exception

    The base class for Delta-specific errors.

    ","boost":2},{"location":"api/exceptions/#deltalake.exceptions.DeltaProtocolError","title":"deltalake.exceptions.DeltaProtocolError","text":"

    Bases: _internal.DeltaError

Raised when a violation of the Delta protocol specification occurs.

    ","boost":2},{"location":"api/exceptions/#deltalake.exceptions.TableNotFoundError","title":"deltalake.exceptions.TableNotFoundError","text":"

    Bases: _internal.DeltaError

    Raised when a Delta table cannot be loaded from a location.

    ","boost":2},{"location":"api/exceptions/#deltalake.exceptions.CommitFailedError","title":"deltalake.exceptions.CommitFailedError","text":"

    Bases: _internal.DeltaError

    Raised when a commit to a Delta table fails.

    ","boost":2},{"location":"api/schema/","title":"Schema","text":"","boost":2},{"location":"api/schema/#schema-and-field","title":"Schema and field","text":"

    Schemas, fields, and data types are provided in the deltalake.schema submodule.

    ","boost":2},{"location":"api/schema/#deltalake.Schema","title":"deltalake.Schema","text":"
    Schema(fields: List[Field])\n

    Bases: deltalake._internal.StructType

    A Delta Lake schema

Create using a list of Field objects:

    Schema([Field(\"x\", \"integer\"), Field(\"y\", \"string\")]) Schema([Field(x, PrimitiveType(\"integer\"), nullable=True), Field(y, PrimitiveType(\"string\"), nullable=True)])

    Or create from a PyArrow schema:

    import pyarrow as pa Schema.from_pyarrow(pa.schema({\"x\": pa.int32(), \"y\": pa.string()})) Schema([Field(x, PrimitiveType(\"integer\"), nullable=True), Field(y, PrimitiveType(\"string\"), nullable=True)])

    ","boost":2},{"location":"api/schema/#deltalake.Schema.invariants","title":"invariants","text":"
    invariants: List[Tuple[str, str]] = <attribute 'invariants' of 'deltalake._internal.Schema' objects>\n
    ","boost":2},{"location":"api/schema/#deltalake.Schema.from_json","title":"from_json staticmethod","text":"
    from_json(schema_json) -> Schema\n

    Create a new Schema from a JSON string.

    Parameters:

    Name Type Description Default json str

    a JSON string

    required Example

    A schema has the same JSON format as a StructType.

    Schema.from_json('''{\n    \"type\": \"struct\",\n    \"fields\": [{\"name\": \"x\", \"type\": \"integer\", \"nullable\": true, \"metadata\": {}}]\n    }\n)'''\n# Returns Schema([Field(x, PrimitiveType(\"integer\"), nullable=True)])\n

    ","boost":2},{"location":"api/schema/#deltalake.Schema.from_pyarrow","title":"from_pyarrow staticmethod","text":"
    from_pyarrow(data_type) -> Schema\n

    Create a Schema from a PyArrow Schema type

    Will raise TypeError if the PyArrow type is not a primitive type.

    Parameters:

    Name Type Description Default type Schema

    A PyArrow Schema

    required

    Returns:

    Type Description Schema

    a Schema

    ","boost":2},{"location":"api/schema/#deltalake.Schema.to_json","title":"to_json method descriptor","text":"
    to_json() -> str\n

    Get the JSON string representation of the Schema.

    Returns:

    Type Description str

    a JSON string

    Example

    A schema has the same JSON format as a StructType.

    Schema([Field(\"x\", \"integer\")]).to_json()\n# Returns '{\"type\":\"struct\",\"fields\":[{\"name\":\"x\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}'\n

    ","boost":2},{"location":"api/schema/#deltalake.Schema.to_pyarrow","title":"to_pyarrow method descriptor","text":"
    to_pyarrow(as_large_types: bool = False) -> pyarrow.Schema\n

    Return equivalent PyArrow schema

    Parameters:

    Name Type Description Default as_large_types bool

    get schema with all variable size types (list, binary, string) as large variants (with int64 indices). This is for compatibility with systems like Polars that only support the large versions of Arrow types.

    False

    Returns:

    Type Description Schema

    a PyArrow Schema

    ","boost":2},{"location":"api/schema/#deltalake.Field","title":"deltalake.Field","text":"
    Field(name: str, type: DataType, *, nullable: bool = True, metadata: Optional[Dict[str, Any]] = None)\n
    ","boost":2},{"location":"api/schema/#deltalake.Field.metadata","title":"metadata","text":"
    metadata: Dict[str, Any] = <attribute 'metadata' of 'deltalake._internal.Field' objects>\n
    ","boost":2},{"location":"api/schema/#deltalake.Field.name","title":"name","text":"
    name: str = <attribute 'name' of 'deltalake._internal.Field' objects>\n
    ","boost":2},{"location":"api/schema/#deltalake.Field.nullable","title":"nullable","text":"
    nullable: bool = <attribute 'nullable' of 'deltalake._internal.Field' objects>\n
    ","boost":2},{"location":"api/schema/#deltalake.Field.type","title":"type","text":"
    type: DataType = <attribute 'type' of 'deltalake._internal.Field' objects>\n
    ","boost":2},{"location":"api/schema/#deltalake.Field.from_json","title":"from_json staticmethod","text":"
    from_json(field_json) -> Field\n

    Create a Field from a JSON string.

    Parameters:

    Name Type Description Default json str

    the JSON string.

    required

    Returns:

    Type Description Field

    Field

    Example
    Field.from_json('''{\n        \"name\": \"col\",\n        \"type\": \"integer\",\n        \"nullable\": true,\n        \"metadata\": {}\n    }'''\n)\n# Returns Field(col, PrimitiveType(\"integer\"), nullable=True)\n
    ","boost":2},{"location":"api/schema/#deltalake.Field.from_pyarrow","title":"from_pyarrow staticmethod","text":"
    from_pyarrow(field: pyarrow.Field) -> Field\n

Create a Field from a PyArrow field. Note: this currently doesn't preserve field metadata.

    Parameters:

    Name Type Description Default field Field

    a PyArrow Field

    required

    Returns:

    Type Description Field

    a Field

    ","boost":2},{"location":"api/schema/#deltalake.Field.to_json","title":"to_json method descriptor","text":"
    to_json() -> str\n

    Get the field as JSON string.

    Returns:

    Type Description str

    a JSON string

    Example
    Field(\"col\", \"integer\").to_json()\n# Returns '{\"name\":\"col\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}'\n
    ","boost":2},{"location":"api/schema/#deltalake.Field.to_pyarrow","title":"to_pyarrow method descriptor","text":"
    to_pyarrow() -> pyarrow.Field\n

Convert to an equivalent PyArrow field. Note: this currently doesn't preserve field metadata.

    Returns:

    Type Description Field

    a pyarrow Field

    ","boost":2},{"location":"api/schema/#data-types","title":"Data types","text":"","boost":2},{"location":"api/schema/#deltalake.schema.PrimitiveType","title":"deltalake.schema.PrimitiveType","text":"
    PrimitiveType(data_type: str)\n
    ","boost":2},{"location":"api/schema/#deltalake.schema.PrimitiveType.type","title":"type","text":"
    type: str = <attribute 'type' of 'deltalake._internal.PrimitiveType' objects>\n
    ","boost":2},{"location":"api/schema/#deltalake.schema.PrimitiveType.from_json","title":"from_json staticmethod","text":"
    from_json(type_json) -> PrimitiveType\n

    Create a PrimitiveType from a JSON string

    The JSON representation for a primitive type is just a quoted string: PrimitiveType.from_json('\"integer\"')

    Parameters:

    Name Type Description Default json str

    a JSON string

    required

    Returns:

    Type Description PrimitiveType

    a PrimitiveType type

    ","boost":2},{"location":"api/schema/#deltalake.schema.PrimitiveType.from_pyarrow","title":"from_pyarrow staticmethod","text":"
    from_pyarrow(data_type) -> PrimitiveType\n

    Create a PrimitiveType from a PyArrow datatype

    Will raise TypeError if the PyArrow type is not a primitive type.

    Parameters:

    Name Type Description Default type DataType

    A PyArrow DataType

    required

    Returns:

    Type Description PrimitiveType

    a PrimitiveType

    ","boost":2},{"location":"api/schema/#deltalake.schema.PrimitiveType.to_pyarrow","title":"to_pyarrow method descriptor","text":"
    to_pyarrow() -> pyarrow.DataType\n

    Get the equivalent PyArrow type (pyarrow.DataType)

    ","boost":2},{"location":"api/schema/#deltalake.schema.ArrayType","title":"deltalake.schema.ArrayType","text":"
    ArrayType(element_type: DataType, *, contains_null: bool = True)\n
    ","boost":2},{"location":"api/schema/#deltalake.schema.ArrayType.contains_null","title":"contains_null","text":"
    contains_null: bool = <attribute 'contains_null' of 'deltalake._internal.ArrayType' objects>\n
    ","boost":2},{"location":"api/schema/#deltalake.schema.ArrayType.element_type","title":"element_type","text":"
    element_type: DataType = <attribute 'element_type' of 'deltalake._internal.ArrayType' objects>\n
    ","boost":2},{"location":"api/schema/#deltalake.schema.ArrayType.type","title":"type","text":"
    type: Literal['array'] = <attribute 'type' of 'deltalake._internal.ArrayType' objects>\n
    ","boost":2},{"location":"api/schema/#deltalake.schema.ArrayType.from_json","title":"from_json staticmethod","text":"
    from_json(type_json) -> ArrayType\n

    Create an ArrayType from a JSON string

    Parameters:

    Name Type Description Default json str

    a JSON string

    required

    Returns:

    Type Description ArrayType

    an ArrayType

    Example

    The JSON representation for an array type is an object with type (set to \"array\"), elementType, and containsNull.

    ArrayType.from_json(\n    '''{\n        \"type\": \"array\",\n        \"elementType\": \"integer\",\n        \"containsNull\": false\n    }'''\n)\n# Returns ArrayType(PrimitiveType(\"integer\"), contains_null=False)\n

    ","boost":2},{"location":"api/schema/#deltalake.schema.ArrayType.from_pyarrow","title":"from_pyarrow staticmethod","text":"
    from_pyarrow(data_type) -> ArrayType\n

    Create an ArrayType from a pyarrow.ListType.

    Will raise TypeError if a different PyArrow DataType is provided.

    Parameters:

    Name Type Description Default type ListType

    The PyArrow ListType

    required

    Returns:

    Type Description ArrayType

    an ArrayType
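For example, a sketch (assuming pyarrow is installed) of converting a pyarrow list type whose elements are nullable int32 values:

import pyarrow as pa\nfrom deltalake.schema import ArrayType\n\nArrayType.from_pyarrow(pa.list_(pa.int32()))\n# Returns ArrayType(PrimitiveType(\"integer\"), contains_null=True)\n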

    ","boost":2},{"location":"api/schema/#deltalake.schema.ArrayType.to_json","title":"to_json method descriptor","text":"
    to_json() -> str\n

    Get the JSON string representation of the type.

    ","boost":2},{"location":"api/schema/#deltalake.schema.ArrayType.to_pyarrow","title":"to_pyarrow method descriptor","text":"
    to_pyarrow() -> pyarrow.ListType\n

    Get the equivalent PyArrow type.

    ","boost":2},{"location":"api/schema/#deltalake.schema.MapType","title":"deltalake.schema.MapType","text":"
    MapType(key_type: DataType, value_type: DataType, *, value_contains_null: bool = True)\n
    ","boost":2},{"location":"api/schema/#deltalake.schema.MapType.key_type","title":"key_type","text":"
    key_type: DataType = <attribute 'key_type' of 'deltalake._internal.MapType' objects>\n
    ","boost":2},{"location":"api/schema/#deltalake.schema.MapType.type","title":"type","text":"
    type: Literal['map'] = <attribute 'type' of 'deltalake._internal.MapType' objects>\n
    ","boost":2},{"location":"api/schema/#deltalake.schema.MapType.value_contains_null","title":"value_contains_null","text":"
    value_contains_null: bool = <attribute 'value_contains_null' of 'deltalake._internal.MapType' objects>\n
    ","boost":2},{"location":"api/schema/#deltalake.schema.MapType.value_type","title":"value_type","text":"
    value_type: DataType = <attribute 'value_type' of 'deltalake._internal.MapType' objects>\n
    ","boost":2},{"location":"api/schema/#deltalake.schema.MapType.from_json","title":"from_json staticmethod","text":"
    from_json(type_json) -> MapType\n

    Create a MapType from a JSON string

    Parameters:

    Name Type Description Default json str

    a JSON string

    required

    Returns:

    Type Description MapType

a MapType

    Example

    The JSON representation for a map type is an object with type (set to map), keyType, valueType, and valueContainsNull:

    MapType.from_json(\n    '''{\n        \"type\": \"map\",\n        \"keyType\": \"integer\",\n        \"valueType\": \"string\",\n        \"valueContainsNull\": true\n    }'''\n)\n# Returns MapType(PrimitiveType(\"integer\"), PrimitiveType(\"string\"), value_contains_null=True)\n
    ","boost":2},{"location":"api/schema/#deltalake.schema.MapType.from_pyarrow","title":"from_pyarrow staticmethod","text":"
    from_pyarrow(data_type) -> MapType\n

    Create a MapType from a PyArrow MapType.

    Will raise TypeError if passed a different type.

    Parameters:

    Name Type Description Default type MapType

    the PyArrow MapType

    required

    Returns:

    Type Description MapType

    a MapType

    ","boost":2},{"location":"api/schema/#deltalake.schema.MapType.to_json","title":"to_json method descriptor","text":"
    to_json() -> str\n

    Get JSON string representation of map type.

    Returns:

    Type Description str

    a JSON string

    ","boost":2},{"location":"api/schema/#deltalake.schema.MapType.to_pyarrow","title":"to_pyarrow method descriptor","text":"
    to_pyarrow() -> pyarrow.MapType\n

    Get the equivalent PyArrow data type.

    ","boost":2},{"location":"api/schema/#deltalake.schema.StructType","title":"deltalake.schema.StructType","text":"
    StructType(fields: List[Field])\n
    ","boost":2},{"location":"api/schema/#deltalake.schema.StructType.fields","title":"fields","text":"
    fields: List[Field] = <attribute 'fields' of 'deltalake._internal.StructType' objects>\n
    ","boost":2},{"location":"api/schema/#deltalake.schema.StructType.type","title":"type","text":"
    type: Literal['struct'] = <attribute 'type' of 'deltalake._internal.StructType' objects>\n

    The string \"struct\"

    ","boost":2},{"location":"api/schema/#deltalake.schema.StructType.from_json","title":"from_json staticmethod","text":"
    from_json(type_json) -> StructType\n

    Create a new StructType from a JSON string.

    Parameters:

    Name Type Description Default json str

    a JSON string

    required

    Returns:

    Type Description StructType

    a StructType

    Example
    StructType.from_json(\n    '''{\n        \"type\": \"struct\",\n        \"fields\": [{\"name\": \"x\", \"type\": \"integer\", \"nullable\": true, \"metadata\": {}}]\n    }'''\n)\n# Returns StructType([Field(x, PrimitiveType(\"integer\"), nullable=True)])\n
    ","boost":2},{"location":"api/schema/#deltalake.schema.StructType.from_pyarrow","title":"from_pyarrow staticmethod","text":"
    from_pyarrow(data_type) -> StructType\n

    Create a new StructType from a PyArrow struct type.

    Will raise TypeError if a different data type is provided.

    Parameters:

    Name Type Description Default type StructType

    a PyArrow struct type.

    required

    Returns:

    Type Description StructType

    a StructType
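A short sketch, assuming pyarrow is installed, of converting a pyarrow struct type:

import pyarrow as pa\nfrom deltalake.schema import StructType\n\nStructType.from_pyarrow(pa.struct([pa.field(\"x\", pa.int32())]))\n# Returns StructType([Field(x, PrimitiveType(\"integer\"), nullable=True)])\n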

    ","boost":2},{"location":"api/schema/#deltalake.schema.StructType.to_json","title":"to_json method descriptor","text":"
    to_json() -> str\n

    Get the JSON representation of the type.

    Returns:

    Type Description str

    a JSON string

    Example
    StructType([Field(\"x\", \"integer\")]).to_json()\n# Returns '{\"type\":\"struct\",\"fields\":[{\"name\":\"x\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}'\n
    ","boost":2},{"location":"api/schema/#deltalake.schema.StructType.to_pyarrow","title":"to_pyarrow method descriptor","text":"
    to_pyarrow() -> pyarrow.StructType\n

    Get the equivalent PyArrow StructType

    Returns:

    Type Description StructType

    a PyArrow StructType

    ","boost":2},{"location":"api/storage/","title":"Storage","text":"

    The delta filesystem handler for the pyarrow engine writer.

    ","boost":2},{"location":"api/storage/#deltalake.fs.DeltaStorageHandler","title":"deltalake.fs.DeltaStorageHandler","text":"
    DeltaStorageHandler(table_uri: str, options: Optional[Dict[str, str]] = None, known_sizes: Optional[Dict[str, int]] = None)\n

    Bases: FileSystemHandler

DeltaStorageHandler is a concrete implementation of a PyArrow FileSystemHandler.
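As an illustrative sketch (the table URI and the storage option key are placeholders), the handler can be wrapped in a pyarrow.fs.PyFileSystem so that pyarrow readers can access files at the table location:

import pyarrow.fs as fs\nfrom deltalake.fs import DeltaStorageHandler\n\nhandler = DeltaStorageHandler(\"s3://<bucket>/<path>\", options={\"AWS_REGION\": \"<region>\"})\nfilesystem = fs.PyFileSystem(handler)\n# filesystem can now be passed to pyarrow readers, e.g. pyarrow.dataset.dataset(..., filesystem=filesystem)\n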

    ","boost":2},{"location":"api/storage/#deltalake.fs.DeltaStorageHandler.copy_file","title":"copy_file","text":"
    copy_file(src: str, dst: str) -> None\n

    Copy a file.

    If the destination exists and is a directory, an error is returned. Otherwise, it is replaced.

    ","boost":2},{"location":"api/storage/#deltalake.fs.DeltaStorageHandler.create_dir","title":"create_dir","text":"
    create_dir(path: str, recursive: bool = True) -> None\n

    Create a directory and subdirectories.

    This function succeeds if the directory already exists.

    ","boost":2},{"location":"api/storage/#deltalake.fs.DeltaStorageHandler.delete_dir","title":"delete_dir","text":"
    delete_dir(path: str) -> None\n

    Delete a directory and its contents, recursively.

    ","boost":2},{"location":"api/storage/#deltalake.fs.DeltaStorageHandler.delete_dir_contents","title":"delete_dir_contents","text":"
    delete_dir_contents(path: str, *, accept_root_dir: bool = False, missing_dir_ok: bool = False) -> None\n

    Delete a directory's contents, recursively.

    Like delete_dir, but doesn't delete the directory itself.

    ","boost":2},{"location":"api/storage/#deltalake.fs.DeltaStorageHandler.delete_file","title":"delete_file","text":"
    delete_file(path: str) -> None\n

    Delete a file.

    ","boost":2},{"location":"api/storage/#deltalake.fs.DeltaStorageHandler.delete_root_dir_contents","title":"delete_root_dir_contents","text":"
    delete_root_dir_contents() -> None\n

    Delete the root directory contents, recursively.

    ","boost":2},{"location":"api/storage/#deltalake.fs.DeltaStorageHandler.get_file_info","title":"get_file_info","text":"
    get_file_info(paths: List[str]) -> List[FileInfo]\n

    Get info for the given files.

    A non-existing or unreachable file returns a FileStat object and has a FileType of value NotFound. An exception indicates a truly exceptional condition (low-level I/O error, etc.).

    ","boost":2},{"location":"api/storage/#deltalake.fs.DeltaStorageHandler.get_file_info_selector","title":"get_file_info_selector","text":"
    get_file_info_selector(selector: FileSelector) -> List[FileInfo]\n

    Get info for the files defined by FileSelector.

    Parameters:

    Name Type Description Default selector FileSelector

    FileSelector object

    required

    Returns:

    Type Description List[FileInfo]

    list of file info objects

    ","boost":2},{"location":"api/storage/#deltalake.fs.DeltaStorageHandler.move","title":"move","text":"
    move(src: str, dest: str) -> None\n

    Move / rename a file or directory.

    If the destination exists: - if it is a non-empty directory, an error is returned - otherwise, if it has the same type as the source, it is replaced - otherwise, behavior is unspecified (implementation-dependent).

    ","boost":2},{"location":"api/storage/#deltalake.fs.DeltaStorageHandler.normalize_path","title":"normalize_path","text":"
    normalize_path(path: str) -> str\n

    Normalize filesystem path.

    ","boost":2},{"location":"api/storage/#deltalake.fs.DeltaStorageHandler.open_input_file","title":"open_input_file","text":"
    open_input_file(path: str) -> pa.PythonFile\n

    Open an input file for random access reading.

    Parameters:

    Name Type Description Default path str

    The source to open for reading.

    required

    Returns:

    Type Description PythonFile

    NativeFile

    ","boost":2},{"location":"api/storage/#deltalake.fs.DeltaStorageHandler.open_input_stream","title":"open_input_stream","text":"
    open_input_stream(path: str) -> pa.PythonFile\n

    Open an input stream for sequential reading.

    Parameters:

    Name Type Description Default path str

    The source to open for reading.

    required

    Returns:

    Type Description PythonFile

    NativeFile

    ","boost":2},{"location":"api/storage/#deltalake.fs.DeltaStorageHandler.open_output_stream","title":"open_output_stream","text":"
    open_output_stream(path: str, metadata: Optional[Dict[str, str]] = None) -> pa.PythonFile\n

    Open an output stream for sequential writing.

    If the target already exists, existing data is truncated.

    Parameters:

    Name Type Description Default path str

    The source to open for writing.

    required metadata Optional[Dict[str, str]]

    If not None, a mapping of string keys to string values.

    None

    Returns:

    Type Description PythonFile

    NativeFile

    ","boost":2},{"location":"api/delta_table/","title":"DeltaTable","text":"","boost":2},{"location":"api/delta_table/#deltalake.DeltaTable","title":"deltalake.DeltaTable dataclass","text":"
    DeltaTable(table_uri: Union[str, Path, os.PathLike[str]], version: Optional[int] = None, storage_options: Optional[Dict[str, str]] = None, without_files: bool = False, log_buffer_size: Optional[int] = None)\n

    Represents a Delta Table

Create the Delta Table from a path with an optional version. Multiple StorageBackends are currently supported: AWS S3, Azure Data Lake Storage Gen2, Google Cloud Storage (GCS) and local URI. Depending on the storage backend used, you can provide option values using the storage_options parameter.

    Parameters:

    Name Type Description Default table_uri Union[str, Path, PathLike[str]]

    the path of the DeltaTable

    required version Optional[int]

    version of the DeltaTable

    None storage_options Optional[Dict[str, str]]

    a dictionary of the options to use for the storage backend

    None without_files bool

    If True, will load table without tracking files. Some append-only applications might have no need of tracking any files. So, the DeltaTable will be loaded with a significant memory reduction.

    False log_buffer_size Optional[int]

    Number of files to buffer when reading the commit log. A positive integer. Setting a value greater than 1 results in concurrent calls to the storage api. This can decrease latency if there are many files in the log since the last checkpoint, but will also increase memory usage. Possible rate limits of the storage backend should also be considered for optimal performance. Defaults to 4 * number of cpus.

    None","boost":2},{"location":"api/delta_table/#deltalake.DeltaTable.alter","title":"alter property","text":"
    alter: TableAlterer\n

    Namespace for all table alter related methods.

    Returns:

    Name Type Description TableAlterer TableAlterer

    TableAlterer Object

    ","boost":2},{"location":"api/delta_table/#deltalake.DeltaTable.optimize","title":"optimize property","text":"
    optimize: TableOptimizer\n

    Namespace for all table optimize related methods.

    Returns:

    Name Type Description TableOptimizer TableOptimizer

    TableOptimizer Object

    ","boost":2},{"location":"api/delta_table/#deltalake.DeltaTable.cleanup_metadata","title":"cleanup_metadata","text":"
    cleanup_metadata() -> None\n

    Delete expired log files before current version from table. The table log retention is based on the configuration.logRetentionDuration value, 30 days by default.

    ","boost":2},{"location":"api/delta_table/#deltalake.DeltaTable.create","title":"create classmethod","text":"
    create(table_uri: Union[str, Path], schema: Union[pyarrow.Schema, DeltaSchema], mode: Literal['error', 'append', 'overwrite', 'ignore'] = 'error', partition_by: Optional[Union[List[str], str]] = None, name: Optional[str] = None, description: Optional[str] = None, configuration: Optional[Mapping[str, Optional[str]]] = None, storage_options: Optional[Dict[str, str]] = None, custom_metadata: Optional[Dict[str, str]] = None, raise_if_key_not_exists: bool = True) -> DeltaTable\n

    CREATE or CREATE_OR_REPLACE a delta table given a table_uri.

    Parameters:

    Name Type Description Default table_uri Union[str, Path]

    URI of a table

    required schema Union[Schema, Schema]

    Table schema

    required mode Literal['error', 'append', 'overwrite', 'ignore']

    How to handle existing data. Default is to error if table already exists. If 'append', returns not support error if table exists. If 'overwrite', will CREATE_OR_REPLACE table. If 'ignore', will not do anything if table already exists. Defaults to \"error\".

    'error' partition_by Optional[Union[List[str], str]]

    List of columns to partition the table by.

    None name Optional[str]

    User-provided identifier for this table.

    None description Optional[str]

    User-provided description for this table.

    None configuration Optional[Mapping[str, Optional[str]]]

    A map containing configuration options for the metadata action.

    None storage_options Optional[Dict[str, str]]

    Options passed to the object store crate.

    None custom_metadata Optional[Dict[str, str]]

    Custom metadata that will be added to the transaction commit.

    None raise_if_key_not_exists bool

    Whether to raise an error if the configuration uses keys that are not Delta keys

    True

    Returns:

    Name Type Description DeltaTable DeltaTable

    created delta table

    Example
    import pyarrow as pa\n\nfrom deltalake import DeltaTable\n\ndt = DeltaTable.create(\n    table_uri=\"my_local_table\",\n    schema=pa.schema(\n        [pa.field(\"foo\", pa.string()), pa.field(\"bar\", pa.string())]\n    ),\n    mode=\"error\",\n    partition_by=\"bar\",\n)\n
    ","boost":2},{"location":"api/delta_table/#deltalake.DeltaTable.delete","title":"delete","text":"
    delete(predicate: Optional[str] = None, writer_properties: Optional[WriterProperties] = None, custom_metadata: Optional[Dict[str, str]] = None) -> Dict[str, Any]\n

Delete records from a Delta Table that satisfy a predicate.

When a predicate is not provided, all records are deleted from the Delta Table. Otherwise, a scan of the Delta table is performed to mark any files that contain records satisfying the predicate. Once the files are determined, they are rewritten without those records.

    Parameters:

    Name Type Description Default predicate Optional[str]

    a SQL where clause. If not passed, will delete all rows.

    None writer_properties Optional[WriterProperties]

    Pass writer properties to the Rust parquet writer.

    None custom_metadata Optional[Dict[str, str]]

    custom metadata that will be added to the transaction commit.

    None

    Returns:

    Type Description Dict[str, Any]

    the metrics from delete.
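For example, a minimal sketch (the table path \"tmp\" is a placeholder) of deleting rows that match a SQL predicate:

from deltalake import DeltaTable\n\ndt = DeltaTable(\"tmp\")\nmetrics = dt.delete(predicate=\"id = '3'\")  # returns a dict with the delete metrics\n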

    ","boost":2},{"location":"api/delta_table/#deltalake.DeltaTable.file_uris","title":"file_uris","text":"
    file_uris(partition_filters: Optional[List[Tuple[str, str, Any]]] = None) -> List[str]\n

    Get the list of files as absolute URIs, including the scheme (e.g. \"s3://\").

    Local files will be just plain absolute paths, without a scheme. (That is, no 'file://' prefix.)

    Use the partition_filters parameter to retrieve a subset of files that match the given filters.

    Parameters:

    Name Type Description Default partition_filters Optional[List[Tuple[str, str, Any]]]

    the partition filters that will be used for getting the matched files

    None

    Returns:

    Type Description List[str]

    list of the .parquet files with an absolute URI referenced for the current version of the DeltaTable

Predicates are expressed in disjunctive normal form (DNF), like [(\"x\", \"=\", \"a\"), ...]. DNF allows arbitrary boolean logical combinations of single partition predicates. The innermost tuples each describe a single partition predicate. The list of inner predicates is interpreted as a conjunction (AND), forming a more selective, multi-partition predicate. Each tuple has the format (key, op, value) and compares the key with the value. The supported ops are =, !=, in, and not in. If the op is in or not in, the value must be a collection such as a list, a set or a tuple. The supported type for value is str. Use the empty string '' for a Null partition value.

    Example
    (\"x\", \"=\", \"a\")\n(\"x\", \"!=\", \"a\")\n(\"y\", \"in\", [\"a\", \"b\", \"c\"])\n(\"z\", \"not in\", [\"a\",\"b\"])\n
    ","boost":2},{"location":"api/delta_table/#deltalake.DeltaTable.files","title":"files","text":"
    files(partition_filters: Optional[List[Tuple[str, str, Any]]] = None) -> List[str]\n

    Get the .parquet files of the DeltaTable.

    The paths are as they are saved in the delta log, which may either be relative to the table root or absolute URIs.

    Parameters:

    Name Type Description Default partition_filters Optional[List[Tuple[str, str, Any]]]

    the partition filters that will be used for getting the matched files

    None

    Returns:

    Type Description List[str]

    list of the .parquet files referenced for the current version of the DeltaTable

Predicates are expressed in disjunctive normal form (DNF), like [(\"x\", \"=\", \"a\"), ...]. DNF allows arbitrary boolean logical combinations of single partition predicates. The innermost tuples each describe a single partition predicate. The list of inner predicates is interpreted as a conjunction (AND), forming a more selective, multi-partition predicate. Each tuple has the format (key, op, value) and compares the key with the value. The supported ops are =, !=, in, and not in. If the op is in or not in, the value must be a collection such as a list, a set or a tuple. The supported type for value is str. Use the empty string '' for a Null partition value.

    Example
    (\"x\", \"=\", \"a\")\n(\"x\", \"!=\", \"a\")\n(\"y\", \"in\", [\"a\", \"b\", \"c\"])\n(\"z\", \"not in\", [\"a\",\"b\"])\n
    ","boost":2},{"location":"api/delta_table/#deltalake.DeltaTable.files_by_partitions","title":"files_by_partitions","text":"
    files_by_partitions(partition_filters: Optional[FilterType]) -> List[str]\n

    Get the files for each partition

    ","boost":2},{"location":"api/delta_table/#deltalake.DeltaTable.from_data_catalog","title":"from_data_catalog classmethod","text":"
    from_data_catalog(data_catalog: DataCatalog, database_name: str, table_name: str, data_catalog_id: Optional[str] = None, version: Optional[int] = None, log_buffer_size: Optional[int] = None) -> DeltaTable\n

    Create the Delta Table from a Data Catalog.

    Parameters:

    Name Type Description Default data_catalog DataCatalog

    the Catalog to use for getting the storage location of the Delta Table

    required database_name str

    the database name inside the Data Catalog

    required table_name str

    the table name inside the Data Catalog

    required data_catalog_id Optional[str]

    the identifier of the Data Catalog

    None version Optional[int]

    version of the DeltaTable

    None log_buffer_size Optional[int]

    Number of files to buffer when reading the commit log. A positive integer. Setting a value greater than 1 results in concurrent calls to the storage api. This can decrease latency if there are many files in the log since the last checkpoint, but will also increase memory usage. Possible rate limits of the storage backend should also be considered for optimal performance. Defaults to 4 * number of cpus.

    None","boost":2},{"location":"api/delta_table/#deltalake.DeltaTable.get_add_actions","title":"get_add_actions","text":"
    get_add_actions(flatten: bool = False) -> pyarrow.RecordBatch\n

    Return a dataframe with all current add actions.

    Add actions represent the files that currently make up the table. This data is a low-level representation parsed from the transaction log.

    Parameters:

    Name Type Description Default flatten bool

whether to flatten the schema. Partition value columns are given the prefix partition., statistics (null_count, min, and max) are given the prefixes null_count., min., and max., and tags the prefix tags.. Nested field names are concatenated with ..

    False

    Returns:

    Type Description RecordBatch

    a PyArrow RecordBatch containing the add action data.

    Example
    from pprint import pprint\nfrom deltalake import DeltaTable, write_deltalake\nimport pyarrow as pa\ndata = pa.table({\"x\": [1, 2, 3], \"y\": [4, 5, 6]})\nwrite_deltalake(\"tmp\", data, partition_by=[\"x\"])\ndt = DeltaTable(\"tmp\")\ndf = dt.get_add_actions().to_pandas()\ndf[\"path\"].sort_values(ignore_index=True)\n0    x=1/0\n1    x=2/0\n2    x=3/0\n
    df = dt.get_add_actions(flatten=True).to_pandas()\ndf[\"partition.x\"].sort_values(ignore_index=True)\n0    1\n1    2\n2    3\n
    ","boost":2},{"location":"api/delta_table/#deltalake.DeltaTable.history","title":"history","text":"
    history(limit: Optional[int] = None) -> List[Dict[str, Any]]\n

    Run the history command on the DeltaTable. The operations are returned in reverse chronological order.

    Parameters:

    Name Type Description Default limit Optional[int]

    the commit info limit to return

    None

    Returns:

    Type Description List[Dict[str, Any]]

    list of the commit infos registered in the transaction log
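A minimal sketch (the table path \"tmp\" is a placeholder) of reading the most recent commit info:

from deltalake import DeltaTable\n\ndt = DeltaTable(\"tmp\")\ndt.history(limit=1)  # a list with the most recent commit info first\n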

    ","boost":2},{"location":"api/delta_table/#deltalake.DeltaTable.load_as_version","title":"load_as_version","text":"
    load_as_version(version: Union[int, str, datetime]) -> None\n

Load/time travel a DeltaTable to a specified version number, or a timestamp version of the table. If a string is passed, it should be in RFC 3339 / ISO 8601 date and time format. If a datetime object without a timezone is passed, the UTC timezone will be assumed.

    Parameters:

    Name Type Description Default version Union[int, str, datetime]

    the identifier of the version of the DeltaTable to load

    required Example

    Use a version number

    dt = DeltaTable(\"test_table\")\ndt.load_as_version(1)\n

    Use a datetime object

    dt.load_as_version(datetime(2023, 1, 1))\ndt.load_as_version(datetime(2023, 1, 1, tzinfo=timezone.utc))\n

    Use a datetime in string format

    dt.load_as_version(\"2018-01-26T18:30:09Z\")\ndt.load_as_version(\"2018-12-19T16:39:57-08:00\")\ndt.load_as_version(\"2018-01-26T18:30:09.453+00:00\")\n

    ","boost":2},{"location":"api/delta_table/#deltalake.DeltaTable.load_version","title":"load_version","text":"
    load_version(version: int) -> None\n

    Load a DeltaTable with a specified version.

    Deprecated

load_version and load_with_datetime have been combined into DeltaTable.load_as_version.

    Parameters:

    Name Type Description Default version int

    the identifier of the version of the DeltaTable to load

    required","boost":2},{"location":"api/delta_table/#deltalake.DeltaTable.load_with_datetime","title":"load_with_datetime","text":"
    load_with_datetime(datetime_string: str) -> None\n

    Time travel Delta table to the latest version that's created at or before provided datetime_string argument. The datetime_string argument should be an RFC 3339 and ISO 8601 date and time string.

    Deprecated

load_version and load_with_datetime have been combined into DeltaTable.load_as_version.

    Parameters:

    Name Type Description Default datetime_string str

    the identifier of the datetime point of the DeltaTable to load

    required Example
    \"2018-01-26T18:30:09Z\"\n\"2018-12-19T16:39:57-08:00\"\n\"2018-01-26T18:30:09.453+00:00\"\n
    ","boost":2},{"location":"api/delta_table/#deltalake.DeltaTable.merge","title":"merge","text":"
    merge(source: Union[pyarrow.Table, pyarrow.RecordBatch, pyarrow.RecordBatchReader, ds.Dataset, pd.DataFrame], predicate: str, source_alias: Optional[str] = None, target_alias: Optional[str] = None, error_on_type_mismatch: bool = True, writer_properties: Optional[WriterProperties] = None, large_dtypes: bool = True, custom_metadata: Optional[Dict[str, str]] = None) -> TableMerger\n

Pass the source data that you want to merge into the target Delta table, providing a predicate in a SQL-like format. You can also specify what to do when the underlying data types do not match those of the target table.

    Parameters:

    Name Type Description Default source Union[Table, RecordBatch, RecordBatchReader, Dataset, DataFrame]

    source data

    required predicate str

    SQL like predicate on how to merge

    required source_alias Optional[str]

    Alias for the source table

    None target_alias Optional[str]

    Alias for the target table

    None error_on_type_mismatch bool

specify whether merge should raise an error if data types are mismatched. Defaults to True.

    True writer_properties Optional[WriterProperties]

    Pass writer properties to the Rust parquet writer

    None large_dtypes bool

    If True, the data schema is kept in large_dtypes.

    True custom_metadata Optional[Dict[str, str]]

    custom metadata that will be added to the transaction commit.

    None

    Returns:

    Name Type Description TableMerger TableMerger

    TableMerger Object
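A sketch of a typical merge chain (the table path \"tmp\" and the sample data are placeholders); the returned TableMerger is configured with when_* clauses and then executed:

from deltalake import DeltaTable, write_deltalake\nimport pyarrow as pa\n\nwrite_deltalake(\"tmp\", pa.table({\"x\": [1, 2, 3], \"y\": [4, 5, 6]}))\ndt = DeltaTable(\"tmp\")\nsource = pa.table({\"x\": [2, 4], \"y\": [50, 70]})\n\n(\n    dt.merge(\n        source=source,\n        predicate=\"target.x = source.x\",\n        source_alias=\"source\",\n        target_alias=\"target\")\n    .when_matched_update_all()\n    .when_not_matched_insert_all()\n    .execute()\n)\n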

    ","boost":2},{"location":"api/delta_table/#deltalake.DeltaTable.metadata","title":"metadata","text":"
    metadata() -> Metadata\n

    Get the current metadata of the DeltaTable.

    Returns:

    Type Description Metadata

    the current Metadata registered in the transaction log

    ","boost":2},{"location":"api/delta_table/#deltalake.DeltaTable.protocol","title":"protocol","text":"
    protocol() -> ProtocolVersions\n

    Get the reader and writer protocol versions of the DeltaTable.

    Returns:

    Type Description ProtocolVersions

    the current ProtocolVersions registered in the transaction log

    ","boost":2},{"location":"api/delta_table/#deltalake.DeltaTable.repair","title":"repair","text":"
    repair(dry_run: bool = False, custom_metadata: Optional[Dict[str, str]] = None) -> Dict[str, Any]\n

Repair the Delta Table by auditing active files that do not exist in the underlying filesystem and removing them. This can be useful when there are accidental deletions or corrupted files.

    Active files are ones that have an add action in the log, but no corresponding remove action. This operation creates a new FSCK transaction containing a remove action for each of the missing or corrupted files.

    Parameters:

    Name Type Description Default dry_run bool

    when activated, list only the files, otherwise add remove actions to transaction log. Defaults to False.

    False custom_metadata Optional[Dict[str, str]]

    custom metadata that will be added to the transaction commit.

    None

    Returns: The metrics from repair (FSCK) action.

    Example

    from deltalake import DeltaTable\ndt = DeltaTable('TEST')\ndt.repair(dry_run=False)\n
    Results in
{'dry_run': False, 'files_removed': ['6-0d084325-6885-4847-b008-82c1cf30674c-0.parquet', '5-4fba1d3e-3e20-4de1-933d-a8e13ac59f53-0.parquet']}\n

    ","boost":2},{"location":"api/delta_table/#deltalake.DeltaTable.restore","title":"restore","text":"
    restore(target: Union[int, datetime, str], *, ignore_missing_files: bool = False, protocol_downgrade_allowed: bool = False, custom_metadata: Optional[Dict[str, str]] = None) -> Dict[str, Any]\n

    Run the Restore command on the Delta Table: restore table to a given version or datetime.

    Parameters:

    Name Type Description Default target Union[int, datetime, str]

the version to restore to, represented by an int, a date string, or a datetime.

    required ignore_missing_files bool

whether the operation should carry on when some data files are missing.

    False protocol_downgrade_allowed bool

whether the operation is allowed to downgrade the table's protocol version.

    False custom_metadata Optional[Dict[str, str]]

    custom metadata that will be added to the transaction commit.

    None

    Returns:

    Type Description Dict[str, Any]

    the metrics from restore.
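A minimal sketch (the table path \"tmp\" is a placeholder) of restoring a table to an earlier version:

from deltalake import DeltaTable\n\ndt = DeltaTable(\"tmp\")\ndt.restore(target=1)  # restore to version 1; returns the restore metrics\n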

    ","boost":2},{"location":"api/delta_table/#deltalake.DeltaTable.schema","title":"schema","text":"
    schema() -> DeltaSchema\n

    Get the current schema of the DeltaTable.

    Returns:

    Type Description Schema

    the current Schema registered in the transaction log

    ","boost":2},{"location":"api/delta_table/#deltalake.DeltaTable.to_pandas","title":"to_pandas","text":"
    to_pandas(partitions: Optional[List[Tuple[str, str, Any]]] = None, columns: Optional[List[str]] = None, filesystem: Optional[Union[str, pa_fs.FileSystem]] = None, filters: Optional[Union[FilterType, Expression]] = None) -> pd.DataFrame\n

    Build a pandas dataframe using data from the DeltaTable.

    Parameters:

    Name Type Description Default partitions Optional[List[Tuple[str, str, Any]]]

    A list of partition filters, see help(DeltaTable.files_by_partitions) for filter syntax

    None columns Optional[List[str]]

    The columns to project. This can be a list of column names to include (order and duplicates will be preserved)

    None filesystem Optional[Union[str, FileSystem]]

    A concrete implementation of the Pyarrow FileSystem or a fsspec-compatible interface. If None, the first file path will be used to determine the right FileSystem

    None filters Optional[Union[FilterType, Expression]]

    A disjunctive normal form (DNF) predicate for filtering rows, or directly a pyarrow.dataset.Expression. If you pass a filter you do not need to pass partitions

    None","boost":2},{"location":"api/delta_table/#deltalake.DeltaTable.to_pyarrow_dataset","title":"to_pyarrow_dataset","text":"
    to_pyarrow_dataset(partitions: Optional[List[Tuple[str, str, Any]]] = None, filesystem: Optional[Union[str, pa_fs.FileSystem]] = None, parquet_read_options: Optional[ParquetReadOptions] = None, schema: Optional[pyarrow.Schema] = None, as_large_types: bool = False) -> pyarrow.dataset.Dataset\n

    Build a PyArrow Dataset using data from the DeltaTable.

    Parameters:

    Name Type Description Default partitions Optional[List[Tuple[str, str, Any]]]

    A list of partition filters, see help(DeltaTable.files_by_partitions) for filter syntax

    None filesystem Optional[Union[str, FileSystem]]

    A concrete implementation of the Pyarrow FileSystem or a fsspec-compatible interface. If None, the first file path will be used to determine the right FileSystem

    None parquet_read_options Optional[ParquetReadOptions]

    Optional read options for Parquet. Use this to handle INT96 to timestamp conversion for edge cases like 0001-01-01 or 9999-12-31

    None schema Optional[Schema]

The schema to use for the dataset. If None, the schema of the DeltaTable will be used. This can be used to force reading of Parquet/Arrow datatypes that DeltaLake can't represent in its schema (e.g. LargeString). If you only need to read the schema with large types (e.g. for compatibility with Polars), you may want to use the as_large_types parameter instead.

    None as_large_types bool

    get schema with all variable size types (list, binary, string) as large variants (with int64 indices). This is for compatibility with systems like Polars that only support the large versions of Arrow types. If schema is passed it takes precedence over this option.

    False

    More info: https://arrow.apache.org/docs/python/generated/pyarrow.dataset.ParquetReadOptions.html

    Example

    deltalake will work with any storage compliant with :class:pyarrow.fs.FileSystem, however the root of the filesystem has to be adjusted to point at the root of the Delta table. We can achieve this by wrapping the custom filesystem into a :class:pyarrow.fs.SubTreeFileSystem.

    import pyarrow.fs as fs\nfrom deltalake import DeltaTable\n\ntable_uri = \"s3://<bucket>/<path>\"\nraw_fs, normalized_path = fs.FileSystem.from_uri(table_uri)\nfilesystem = fs.SubTreeFileSystem(normalized_path, raw_fs)\n\ndt = DeltaTable(table_uri)\nds = dt.to_pyarrow_dataset(filesystem=filesystem)\n

    Returns:

    Type Description Dataset

    the PyArrow dataset in PyArrow

    ","boost":2},{"location":"api/delta_table/#deltalake.DeltaTable.to_pyarrow_table","title":"to_pyarrow_table","text":"
    to_pyarrow_table(partitions: Optional[List[Tuple[str, str, Any]]] = None, columns: Optional[List[str]] = None, filesystem: Optional[Union[str, pa_fs.FileSystem]] = None, filters: Optional[Union[FilterType, Expression]] = None) -> pyarrow.Table\n

    Build a PyArrow Table using data from the DeltaTable.

    Parameters:

    Name Type Description Default partitions Optional[List[Tuple[str, str, Any]]]

    A list of partition filters, see help(DeltaTable.files_by_partitions) for filter syntax

    None columns Optional[List[str]]

    The columns to project. This can be a list of column names to include (order and duplicates will be preserved)

    None filesystem Optional[Union[str, FileSystem]]

    A concrete implementation of the Pyarrow FileSystem or a fsspec-compatible interface. If None, the first file path will be used to determine the right FileSystem

    None filters Optional[Union[FilterType, Expression]]

    A disjunctive normal form (DNF) predicate for filtering rows, or directly a pyarrow.dataset.Expression. If you pass a filter you do not need to pass partitions

    None","boost":2},{"location":"api/delta_table/#deltalake.DeltaTable.update","title":"update","text":"
    update(updates: Optional[Dict[str, str]] = None, new_values: Optional[Dict[str, Union[int, float, str, datetime, bool, List[Any]]]] = None, predicate: Optional[str] = None, writer_properties: Optional[WriterProperties] = None, error_on_type_mismatch: bool = True, custom_metadata: Optional[Dict[str, str]] = None) -> Dict[str, Any]\n

UPDATE records in the Delta Table that match an optional predicate. Either updates or new_values needs to be passed for it to execute.

    Parameters:

    Name Type Description Default updates Optional[Dict[str, str]]

    a mapping of column name to update SQL expression.

    None new_values Optional[Dict[str, Union[int, float, str, datetime, bool, List[Any]]]]

    a mapping of column name to python datatype.

    None predicate Optional[str]

    a logical expression.

    None writer_properties Optional[WriterProperties]

    Pass writer properties to the Rust parquet writer.

    None error_on_type_mismatch bool

specify whether update should raise an error if data types are mismatched. Defaults to True.

    True custom_metadata Optional[Dict[str, str]]

    custom metadata that will be added to the transaction commit.

    None

    Returns: the metrics from update

    Example

    Update some row values with SQL predicate

    This is equivalent to UPDATE table SET deleted = true WHERE id = '3'

    from deltalake import write_deltalake, DeltaTable\nimport pandas as pd\ndf = pd.DataFrame(\n    {\"id\": [\"1\", \"2\", \"3\"],\n    \"deleted\": [False, False, False],\n    \"price\": [10., 15., 20.]\n    })\nwrite_deltalake(\"tmp\", df)\ndt = DeltaTable(\"tmp\")\ndt.update(predicate=\"id = '3'\", updates = {\"deleted\": 'True'})\n\n{'num_added_files': 1, 'num_removed_files': 1, 'num_updated_rows': 1, 'num_copied_rows': 2, 'execution_time_ms': ..., 'scan_time_ms': ...}\n

    Update all row values

    This is equivalent to UPDATE table SET deleted = true, id = concat(id, '_old').

    dt.update(updates = {\"deleted\": 'True', \"id\": \"concat(id, '_old')\"})\n\n{'num_added_files': 1, 'num_removed_files': 1, 'num_updated_rows': 3, 'num_copied_rows': 0, 'execution_time_ms': ..., 'scan_time_ms': ...}\n

    Use Python objects instead of SQL strings

    Use the new_values parameter instead of the updates parameter. For example, this is equivalent to UPDATE table SET price = 150.10 WHERE id = '1'

    dt.update(predicate=\"id = '1_old'\", new_values = {\"price\": 150.10})\n\n{'num_added_files': 1, 'num_removed_files': 1, 'num_updated_rows': 1, 'num_copied_rows': 2, 'execution_time_ms': ..., 'scan_time_ms': ...}\n

    ","boost":2},{"location":"api/delta_table/#deltalake.DeltaTable.update_incremental","title":"update_incremental","text":"
    update_incremental() -> None\n

    Updates the DeltaTable to the latest version by incrementally applying newer versions.
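A minimal sketch (the table path \"tmp\" is a placeholder): load an older version and then catch up to the latest one:

from deltalake import DeltaTable\n\ndt = DeltaTable(\"tmp\", version=0)  # pinned to an older version\ndt.update_incremental()\ndt.version()  # now reports the latest version\n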

    ","boost":2},{"location":"api/delta_table/#deltalake.DeltaTable.vacuum","title":"vacuum","text":"
    vacuum(retention_hours: Optional[int] = None, dry_run: bool = True, enforce_retention_duration: bool = True, custom_metadata: Optional[Dict[str, str]] = None) -> List[str]\n

Run the Vacuum command on the Delta Table: list and delete files that are no longer referenced by the Delta table and are older than the retention threshold.

    Parameters:

    Name Type Description Default retention_hours Optional[int]

the retention threshold in hours; if None, the value from configuration.deletedFileRetentionDuration is used, or a default of 1 week otherwise.

    None dry_run bool

    when activated, list only the files, delete otherwise

    True enforce_retention_duration bool

    when disabled, accepts retention hours smaller than the value from configuration.deletedFileRetentionDuration.

    True custom_metadata Optional[Dict[str, str]]

    custom metadata that will be added to the transaction commit.

    None

Returns: the list of files that are no longer referenced by the Delta Table and are older than the retention threshold.
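For example, a minimal sketch (the table path \"tmp\" is a placeholder) of a dry run followed by an actual vacuum:

from deltalake import DeltaTable\n\ndt = DeltaTable(\"tmp\")\ndt.vacuum(retention_hours=168, dry_run=True)   # list the files that would be deleted\ndt.vacuum(retention_hours=168, dry_run=False)  # actually delete them\n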

    ","boost":2},{"location":"api/delta_table/#deltalake.DeltaTable.version","title":"version","text":"
    version() -> int\n

    Get the version of the DeltaTable.

    Returns:

    Type Description int

    The current version of the DeltaTable

    ","boost":2},{"location":"api/delta_table/delta_table_alterer/","title":"TableAlterer","text":"","boost":10},{"location":"api/delta_table/delta_table_alterer/#deltalake.table.TableAlterer","title":"deltalake.table.TableAlterer","text":"
    TableAlterer(table: DeltaTable)\n

    API for various table alteration commands.

    ","boost":10},{"location":"api/delta_table/delta_table_alterer/#deltalake.table.TableAlterer.add_constraint","title":"add_constraint","text":"
    add_constraint(constraints: Dict[str, str], custom_metadata: Optional[Dict[str, str]] = None) -> None\n

Add constraints to the table. Limited to a single constraint at a time.

    Parameters:

    Name Type Description Default constraints Dict[str, str]

    mapping of constraint name to SQL-expression to evaluate on write

    required custom_metadata Optional[Dict[str, str]]

    custom metadata that will be added to the transaction commit.

    None

    Example:

    from deltalake import DeltaTable\ndt = DeltaTable(\"test_table_constraints\")\ndt.alter.add_constraint({\n    \"value_gt_5\": \"value > 5\",\n})\n

    **Check configuration**\n```\ndt.metadata().configuration\n{'delta.constraints.value_gt_5': 'value > 5'}\n```\n
    ","boost":10},{"location":"api/delta_table/delta_table_alterer/#deltalake.table.TableAlterer.drop_constraint","title":"drop_constraint","text":"
    drop_constraint(name: str, raise_if_not_exists: bool = True, custom_metadata: Optional[Dict[str, str]] = None) -> None\n

Drop constraints from a table. Limited to a single constraint at a time.

    Parameters:

    Name Type Description Default name str

    constraint name which to drop.

    required raise_if_not_exists bool

whether to raise an error if the constraint does not exist.

    True custom_metadata Optional[Dict[str, str]]

    custom metadata that will be added to the transaction commit.

    None

    Example:

    from deltalake import DeltaTable\ndt = DeltaTable(\"test_table_constraints\")\ndt.metadata().configuration\n{'delta.constraints.value_gt_5': 'value > 5'}\n

    **Drop the constraint**\n```python\ndt.alter.drop_constraint(name = \"value_gt_5\")\n```\n\n**Configuration after dropping**\n```python\ndt.metadata().configuration\n{}\n```\n
    ","boost":10},{"location":"api/delta_table/delta_table_alterer/#deltalake.table.TableAlterer.set_table_properties","title":"set_table_properties","text":"
    set_table_properties(properties: Dict[str, str], raise_if_not_exists: bool = True, custom_metadata: Optional[Dict[str, str]] = None) -> None\n

Set properties on the table. Args: properties: a mapping of property names to the values to set. raise_if_not_exists: whether to raise an error if a property does not exist. custom_metadata: custom metadata that will be added to the transaction commit. Example:
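A minimal sketch (the table path \"tmp\" is a placeholder; delta.enableChangeDataFeed is a standard Delta table property) of setting a property:

from deltalake import DeltaTable\n\ndt = DeltaTable(\"tmp\")\ndt.alter.set_table_properties({\"delta.enableChangeDataFeed\": \"true\"})\n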

    ","boost":10},{"location":"api/delta_table/delta_table_merger/","title":"TableMerger","text":"","boost":2},{"location":"api/delta_table/delta_table_merger/#deltalake.table.TableMerger","title":"deltalake.table.TableMerger","text":"
    TableMerger(table: DeltaTable, source: pyarrow.RecordBatchReader, predicate: str, source_alias: Optional[str] = None, target_alias: Optional[str] = None, safe_cast: bool = True, writer_properties: Optional[WriterProperties] = None, custom_metadata: Optional[Dict[str, str]] = None)\n

    API for various table MERGE commands.

    ","boost":2},{"location":"api/delta_table/delta_table_merger/#deltalake.table.TableMerger.execute","title":"execute","text":"
    execute() -> Dict[str, Any]\n

    Executes MERGE with the previously provided settings in Rust with Apache Datafusion query engine.

    Returns:

    Name Type Description Dict Dict[str, Any]

    metrics

    ","boost":2},{"location":"api/delta_table/delta_table_merger/#deltalake.table.TableMerger.when_matched_delete","title":"when_matched_delete","text":"
    when_matched_delete(predicate: Optional[str] = None) -> TableMerger\n

Delete a matched row from the table only if the given predicate (if specified) is true for the matched row. If not specified, it deletes all matches.

    Note

Column names with special characters, such as numbers or spaces, should be encapsulated in backticks: \"target.123column\" or \"target.my column\"

    Parameters:

    Name Type Description Default predicate (str | None, Optional)

    SQL like predicate on when to delete.

    None

    Returns:

    Name Type Description TableMerger TableMerger

    TableMerger Object

    Example

    Delete on a predicate

    from deltalake import DeltaTable, write_deltalake\nimport pyarrow as pa\n\ndata = pa.table({\"x\": [1, 2, 3], \"y\": [4, 5, 6]})\nwrite_deltalake(\"tmp\", data)\ndt = DeltaTable(\"tmp\")\nnew_data = pa.table({\"x\": [2, 3], \"deleted\": [False, True]})\n\n(\n    dt.merge(\n        source=new_data,\n        predicate='target.x = source.x',\n        source_alias='source',\n        target_alias='target')\n    .when_matched_delete(\n        predicate=\"source.deleted = true\")\n    .execute()\n)\n{'num_source_rows': 2, 'num_target_rows_inserted': 0, 'num_target_rows_updated': 0, 'num_target_rows_deleted': 1, 'num_target_rows_copied': 2, 'num_output_rows': 2, 'num_target_files_added': 1, 'num_target_files_removed': 1, 'execution_time_ms': ..., 'scan_time_ms': ..., 'rewrite_time_ms': ...}\n\ndt.to_pandas().sort_values(\"x\", ignore_index=True)\n   x  y\n0  1  4\n1  2  5\n

    Delete all records that were matched

    dt = DeltaTable(\"tmp\")\n(\n    dt.merge(\n        source=new_data,\n        predicate='target.x = source.x',\n        source_alias='source',\n        target_alias='target')\n    .when_matched_delete()\n    .execute()\n)\n{'num_source_rows': 2, 'num_target_rows_inserted': 0, 'num_target_rows_updated': 0, 'num_target_rows_deleted': 1, 'num_target_rows_copied': 1, 'num_output_rows': 1, 'num_target_files_added': 1, 'num_target_files_removed': 1, 'execution_time_ms': ..., 'scan_time_ms': ..., 'rewrite_time_ms': ...}\n\ndt.to_pandas()\n   x  y\n0  1  4\n

    ","boost":2},{"location":"api/delta_table/delta_table_merger/#deltalake.table.TableMerger.when_matched_update","title":"when_matched_update","text":"
    when_matched_update(updates: Dict[str, str], predicate: Optional[str] = None) -> TableMerger\n

    Update a matched table row based on the rules defined by updates. If a predicate is specified, then it must evaluate to true for the row to be updated.

    Note

Column names with special characters, such as numbers or spaces, should be encapsulated in backticks: \"target.123column\" or \"target.my column\"

    Parameters:

    Name Type Description Default updates Dict[str, str]

    a mapping of column name to update SQL expression.

    required predicate Optional[str]

    SQL like predicate on when to update.

    None

    Returns:

    Name Type Description TableMerger TableMerger

    TableMerger Object

    Example
    from deltalake import DeltaTable, write_deltalake\nimport pyarrow as pa\n\ndata = pa.table({\"x\": [1, 2, 3], \"1y\": [4, 5, 6]})\nwrite_deltalake(\"tmp\", data)\ndt = DeltaTable(\"tmp\")\nnew_data = pa.table({\"x\": [1], \"1y\": [7]})\n\n(\n     dt.merge(\n         source=new_data,\n         predicate=\"target.x = source.x\",\n         source_alias=\"source\",\n         target_alias=\"target\")\n     .when_matched_update(updates={\"x\": \"source.x\", \"`1y`\": \"source.`1y`\"})\n     .execute()\n)\n{'num_source_rows': 1, 'num_target_rows_inserted': 0, 'num_target_rows_updated': 1, 'num_target_rows_deleted': 0, 'num_target_rows_copied': 2, 'num_output_rows': 3, 'num_target_files_added': 1, 'num_target_files_removed': 1, 'execution_time_ms': ..., 'scan_time_ms': ..., 'rewrite_time_ms': ...}\n\ndt.to_pandas()\n   x  y\n0  1  7\n1  2  5\n2  3  6\n
    ","boost":2},{"location":"api/delta_table/delta_table_merger/#deltalake.table.TableMerger.when_matched_update_all","title":"when_matched_update_all","text":"
    when_matched_update_all(predicate: Optional[str] = None) -> TableMerger\n

Update all target fields to the corresponding source field values; source and target are required to have the same field names. If a predicate is specified, then it must evaluate to true for the row to be updated.

    Note

Column names with special characters, such as numbers or spaces, should be encapsulated in backticks: \"target.123column\" or \"target.my column\"

    Parameters:

    Name Type Description Default predicate Optional[str]

    SQL like predicate on when to update all columns.

    None

    Returns:

    Name Type Description TableMerger TableMerger

    TableMerger Object

    Example
    from deltalake import DeltaTable, write_deltalake\nimport pyarrow as pa\n\ndata = pa.table({\"x\": [1, 2, 3], \"y\": [4, 5, 6]})\nwrite_deltalake(\"tmp\", data)\ndt = DeltaTable(\"tmp\")\nnew_data = pa.table({\"x\": [1], \"y\": [7]})\n\n(\n    dt.merge(\n        source=new_data,\n        predicate=\"target.x = source.x\",\n        source_alias=\"source\",\n        target_alias=\"target\")\n    .when_matched_update_all()\n    .execute()\n)\n{'num_source_rows': 1, 'num_target_rows_inserted': 0, 'num_target_rows_updated': 1, 'num_target_rows_deleted': 0, 'num_target_rows_copied': 2, 'num_output_rows': 3, 'num_target_files_added': 1, 'num_target_files_removed': 1, 'execution_time_ms': ..., 'scan_time_ms': ..., 'rewrite_time_ms': ...}\n\ndt.to_pandas()\n   x  y\n0  1  7\n1  2  5\n2  3  6\n
    ","boost":2},{"location":"api/delta_table/delta_table_merger/#deltalake.table.TableMerger.when_not_matched_by_source_delete","title":"when_not_matched_by_source_delete","text":"
    when_not_matched_by_source_delete(predicate: Optional[str] = None) -> TableMerger\n

    Delete a target row that has no matches in the source from the table only if the given predicate (if specified) is true for the target row.

    Note

Column names with special characters, such as numbers or spaces, should be encapsulated in backticks: \"target.123column\" or \"target.my column\"

    Parameters:

    Name Type Description Default predicate Optional[str]

    SQL like predicate on when to delete when not matched by source.

    None

    Returns:

    Name Type Description TableMerger TableMerger

    TableMerger Object
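A sketch in the same style as the other merge examples (paths and data are placeholders), deleting target rows that have no match in the source:

from deltalake import DeltaTable, write_deltalake\nimport pyarrow as pa\n\ndata = pa.table({\"x\": [1, 2, 3], \"y\": [4, 5, 6]})\nwrite_deltalake(\"tmp\", data)\ndt = DeltaTable(\"tmp\")\nnew_data = pa.table({\"x\": [2, 3]})\n\n(\n    dt.merge(\n        source=new_data,\n        predicate=\"target.x = source.x\",\n        source_alias=\"source\",\n        target_alias=\"target\")\n    .when_not_matched_by_source_delete()\n    .execute()\n)\n# Only the rows with x = 2 and x = 3 remain in the target table\n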

    ","boost":2},{"location":"api/delta_table/delta_table_merger/#deltalake.table.TableMerger.when_not_matched_by_source_update","title":"when_not_matched_by_source_update","text":"
    when_not_matched_by_source_update(updates: Dict[str, str], predicate: Optional[str] = None) -> TableMerger\n

    Update a target row that has no matches in the source based on the rules defined by updates. If a predicate is specified, then it must evaluate to true for the row to be updated.

    Note

Column names with special characters, such as numbers or spaces, should be encapsulated in backticks: \"target.123column\" or \"target.my column\"

    Parameters:

    Name Type Description Default updates Dict[str, str]

    a mapping of column name to update SQL expression.

    required predicate Optional[str]

    SQL like predicate on when to update.

    None

    Returns:

    Name Type Description TableMerger TableMerger

    TableMerger Object

    Example
    from deltalake import DeltaTable, write_deltalake\nimport pyarrow as pa\n\ndata = pa.table({\"x\": [1, 2, 3], \"y\": [4, 5, 6]})\nwrite_deltalake(\"tmp\", data)\ndt = DeltaTable(\"tmp\")\nnew_data = pa.table({\"x\": [2, 3, 4]})\n\n(\n   dt.merge(\n       source=new_data,\n       predicate='target.x = source.x',\n       source_alias='source',\n       target_alias='target')\n   .when_not_matched_by_source_update(\n       predicate = \"y > 3\",\n       updates = {\"y\": \"0\"})\n   .execute()\n)\n{'num_source_rows': 3, 'num_target_rows_inserted': 0, 'num_target_rows_updated': 1, 'num_target_rows_deleted': 0, 'num_target_rows_copied': 2, 'num_output_rows': 3, 'num_target_files_added': 1, 'num_target_files_removed': 1, 'execution_time_ms': ..., 'scan_time_ms': ..., 'rewrite_time_ms': ...}\n\ndt.to_pandas().sort_values(\"x\", ignore_index=True)\n   x  y\n0  1  0\n1  2  5\n2  3  6\n
    ","boost":2},{"location":"api/delta_table/delta_table_merger/#deltalake.table.TableMerger.when_not_matched_insert","title":"when_not_matched_insert","text":"
    when_not_matched_insert(updates: Dict[str, str], predicate: Optional[str] = None) -> TableMerger\n

    Insert a new row to the target table based on the rules defined by updates. If a predicate is specified, then it must evaluate to true for the new row to be inserted.

    Note

Column names with special characters, such as numbers or spaces, should be encapsulated in backticks: \"target.123column\" or \"target.my column\"

    Parameters:

    Name Type Description Default updates dict

    a mapping of column name to insert SQL expression.

    required predicate (str | None, Optional)

    SQL like predicate on when to insert.

    None

    Returns:

    Name Type Description TableMerger TableMerger

    TableMerger Object

    Example
    from deltalake import DeltaTable, write_deltalake\nimport pyarrow as pa\n\ndata = pa.table({\"x\": [1, 2, 3], \"y\": [4, 5, 6]})\nwrite_deltalake(\"tmp\", data)\ndt = DeltaTable(\"tmp\")\nnew_data = pa.table({\"x\": [4], \"y\": [7]})\n\n(\n    dt.merge(\n        source=new_data,\n        predicate=\"target.x = source.x\",\n        source_alias=\"source\",\n        target_alias=\"target\",)\n    .when_not_matched_insert(\n        updates={\n            \"x\": \"source.x\",\n            \"y\": \"source.y\",\n        })\n    .execute()\n)\n{'num_source_rows': 1, 'num_target_rows_inserted': 1, 'num_target_rows_updated': 0, 'num_target_rows_deleted': 0, 'num_target_rows_copied': 3, 'num_output_rows': 4, 'num_target_files_added': 1, 'num_target_files_removed': 1, 'execution_time_ms': ..., 'scan_time_ms': ..., 'rewrite_time_ms': ...}\n\ndt.to_pandas().sort_values(\"x\", ignore_index=True)\n   x  y\n0  1  4\n1  2  5\n2  3  6\n3  4  7\n
    ","boost":2},{"location":"api/delta_table/delta_table_merger/#deltalake.table.TableMerger.when_not_matched_insert_all","title":"when_not_matched_insert_all","text":"
    when_not_matched_insert_all(predicate: Optional[str] = None) -> TableMerger\n

Insert a new row into the target table, setting all target fields to the corresponding source field values. Source and target are required to have the same field names. If a predicate is specified, then it must evaluate to true for the new row to be inserted.

    Note

Column names with special characters, such as numbers or spaces, should be encapsulated in backticks: \"target.123column\" or \"target.my column\"

    Parameters:

    Name Type Description Default predicate Optional[str]

    SQL like predicate on when to insert.

    None

    Returns:

    Name Type Description TableMerger TableMerger

    TableMerger Object

    Example
    from deltalake import DeltaTable, write_deltalake\nimport pyarrow as pa\n\ndata = pa.table({\"x\": [1, 2, 3], \"y\": [4, 5, 6]})\nwrite_deltalake(\"tmp\", data)\ndt = DeltaTable(\"tmp\")\nnew_data = pa.table({\"x\": [4], \"y\": [7]})\n\n(\n   dt.merge(\n       source=new_data,\n       predicate='target.x = source.x',\n       source_alias='source',\n       target_alias='target')\n   .when_not_matched_insert_all()\n   .execute()\n)\n{'num_source_rows': 1, 'num_target_rows_inserted': 1, 'num_target_rows_updated': 0, 'num_target_rows_deleted': 0, 'num_target_rows_copied': 3, 'num_output_rows': 4, 'num_target_files_added': 1, 'num_target_files_removed': 1, 'execution_time_ms': ..., 'scan_time_ms': ..., 'rewrite_time_ms': ...}\n\ndt.to_pandas().sort_values(\"x\", ignore_index=True)\n   x  y\n0  1  4\n1  2  5\n2  3  6\n3  4  7\n
    ","boost":2},{"location":"api/delta_table/delta_table_merger/#deltalake.table.TableMerger.with_writer_properties","title":"with_writer_properties","text":"
    with_writer_properties(data_page_size_limit: Optional[int] = None, dictionary_page_size_limit: Optional[int] = None, data_page_row_count_limit: Optional[int] = None, write_batch_size: Optional[int] = None, max_row_group_size: Optional[int] = None) -> TableMerger\n

    Deprecated

    Use .merge(writer_properties = WriterProperties()) instead

    Pass writer properties to the Rust parquet writer, see options https://arrow.apache.org/rust/parquet/file/properties/struct.WriterProperties.html:

    Parameters:

    Name Type Description Default data_page_size_limit Optional[int]

    Limit DataPage size to this in bytes.

    None dictionary_page_size_limit Optional[int]

    Limit the size of each DataPage to store dicts to this amount in bytes.

    None data_page_row_count_limit Optional[int]

    Limit the number of rows in each DataPage.

    None write_batch_size Optional[int]

    Splits internally to smaller batch size.

    None max_row_group_size Optional[int]

    Max number of rows in row group.

    None

    Returns:

    Name Type Description TableMerger TableMerger

    TableMerger Object

    ","boost":2},{"location":"api/delta_table/delta_table_optimizer/","title":"TableOptimizer","text":"","boost":10},{"location":"api/delta_table/delta_table_optimizer/#deltalake.table.TableOptimizer","title":"deltalake.table.TableOptimizer","text":"
    TableOptimizer(table: DeltaTable)\n

    API for various table optimization commands.

    ","boost":10},{"location":"api/delta_table/delta_table_optimizer/#deltalake.table.TableOptimizer.compact","title":"compact","text":"
    compact(partition_filters: Optional[FilterType] = None, target_size: Optional[int] = None, max_concurrent_tasks: Optional[int] = None, min_commit_interval: Optional[Union[int, timedelta]] = None, writer_properties: Optional[WriterProperties] = None, custom_metadata: Optional[Dict[str, str]] = None) -> Dict[str, Any]\n

    Compacts small files to reduce the total number of files in the table.

    This operation is idempotent; if run twice on the same table (assuming it has not been updated) it will do nothing the second time.

    If this operation happens concurrently with any operations other than append, it will fail.

    Parameters:

    Name Type Description Default partition_filters Optional[FilterType]

    the partition filters that will be used for getting the matched files

    None target_size Optional[int]

desired file size after bin-packing files, in bytes. If not provided, will attempt to read the table configuration value delta.targetFileSize. If that value isn't set, will use the default value of 256 MB.

    None max_concurrent_tasks Optional[int]

    the maximum number of concurrent tasks to use for file compaction. Defaults to number of CPUs. More concurrent tasks can make compaction faster, but will also use more memory.

    None min_commit_interval Optional[Union[int, timedelta]]

minimum interval in seconds, or as a timedelta, before a new commit is created. The interval is useful for long-running executions. Set to 0 or timedelta(0) if you want a commit per partition.

    None writer_properties Optional[WriterProperties]

    Pass writer properties to the Rust parquet writer.

    None custom_metadata Optional[Dict[str, str]]

    custom metadata that will be added to the transaction commit.

    None

    Returns:

    Type Description Dict[str, Any]

    the metrics from optimize

    Example

    Use a timedelta object to specify the seconds, minutes or hours of the interval.

    from deltalake import DeltaTable, write_deltalake\nfrom datetime import timedelta\nimport pyarrow as pa\n\nwrite_deltalake(\"tmp\", pa.table({\"x\": [1], \"y\": [4]}))\nwrite_deltalake(\"tmp\", pa.table({\"x\": [2], \"y\": [5]}), mode=\"append\")\n\ndt = DeltaTable(\"tmp\")\ntime_delta = timedelta(minutes=10)\ndt.optimize.compact(min_commit_interval=time_delta)\n{'numFilesAdded': 1, 'numFilesRemoved': 2, 'filesAdded': ..., 'filesRemoved': ..., 'partitionsOptimized': 1, 'numBatches': 2, 'totalConsideredFiles': 2, 'totalFilesSkipped': 0, 'preserveInsertionOrder': True}\n

    ","boost":10},{"location":"api/delta_table/delta_table_optimizer/#deltalake.table.TableOptimizer.z_order","title":"z_order","text":"
    z_order(columns: Iterable[str], partition_filters: Optional[FilterType] = None, target_size: Optional[int] = None, max_concurrent_tasks: Optional[int] = None, max_spill_size: int = 20 * 1024 * 1024 * 1024, min_commit_interval: Optional[Union[int, timedelta]] = None, writer_properties: Optional[WriterProperties] = None, custom_metadata: Optional[Dict[str, str]] = None) -> Dict[str, Any]\n

    Reorders the data using a Z-order curve to improve data skipping.

    This also performs compaction, so the same parameters as compact() apply.

    Parameters:

    Name Type Description Default columns Iterable[str]

    the columns to use for Z-ordering. There must be at least one column.

    required partition_filters Optional[FilterType]

    the partition filters that will be used for getting the matched files

    None target_size Optional[int]

desired file size after bin-packing files, in bytes. If not provided, will attempt to read the table configuration value delta.targetFileSize. If that value isn't set, will use the default value of 256 MB.

    None max_concurrent_tasks Optional[int]

    the maximum number of concurrent tasks to use for file compaction. Defaults to number of CPUs. More concurrent tasks can make compaction faster, but will also use more memory.

    None max_spill_size int

    the maximum number of bytes to spill to disk. Defaults to 20GB.

    20 * 1024 * 1024 * 1024 min_commit_interval Optional[Union[int, timedelta]]

minimum interval in seconds, or as a timedelta, before a new commit is created. The interval is useful for long-running executions. Set to 0 or timedelta(0) if you want a commit per partition.

    None writer_properties Optional[WriterProperties]

    Pass writer properties to the Rust parquet writer.

    None custom_metadata Optional[Dict[str, str]]

    custom metadata that will be added to the transaction commit.

    None

    Returns:

    Type Description Dict[str, Any]

    the metrics from optimize

    Example

    Use a timedelta object to specify the seconds, minutes or hours of the interval.

    from deltalake import DeltaTable, write_deltalake\nfrom datetime import timedelta\nimport pyarrow as pa\n\nwrite_deltalake(\"tmp\", pa.table({\"x\": [1], \"y\": [4]}))\nwrite_deltalake(\"tmp\", pa.table({\"x\": [2], \"y\": [5]}), mode=\"append\")\n\ndt = DeltaTable(\"tmp\")\ntime_delta = timedelta(minutes=10)\ndt.optimize.z_order([\"x\"], min_commit_interval=time_delta)\n{'numFilesAdded': 1, 'numFilesRemoved': 2, 'filesAdded': ..., 'filesRemoved': ..., 'partitionsOptimized': 0, 'numBatches': 1, 'totalConsideredFiles': 2, 'totalFilesSkipped': 0, 'preserveInsertionOrder': True}\n

    ","boost":10},{"location":"api/delta_table/metadata/","title":"Metadata","text":"","boost":2},{"location":"api/delta_table/metadata/#deltalake.Metadata","title":"deltalake.Metadata dataclass","text":"
    Metadata(table: RawDeltaTable)\n

    Create a Metadata instance.

    ","boost":2},{"location":"api/delta_table/metadata/#deltalake.Metadata.configuration","title":"configuration property","text":"
    configuration: Dict[str, str]\n

    Return the DeltaTable properties.

    ","boost":2},{"location":"api/delta_table/metadata/#deltalake.Metadata.created_time","title":"created_time property","text":"
    created_time: int\n

Return the time when this metadata action was created, in milliseconds since the Unix epoch.

    ","boost":2},{"location":"api/delta_table/metadata/#deltalake.Metadata.description","title":"description property","text":"
    description: str\n

    Return the user-provided description of the DeltaTable.

    ","boost":2},{"location":"api/delta_table/metadata/#deltalake.Metadata.id","title":"id property","text":"
    id: int\n

    Return the unique identifier of the DeltaTable.

    ","boost":2},{"location":"api/delta_table/metadata/#deltalake.Metadata.name","title":"name property","text":"
    name: str\n

    Return the user-provided identifier of the DeltaTable.

    ","boost":2},{"location":"api/delta_table/metadata/#deltalake.Metadata.partition_columns","title":"partition_columns property","text":"
    partition_columns: List[str]\n

    Return an array containing the names of the partitioned columns of the DeltaTable.
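As a quick sketch of how these properties are typically accessed (the table path is illustrative), call DeltaTable.metadata() and read the attributes from the returned Metadata object:

from deltalake import DeltaTable\n\ndt = DeltaTable(\"tmp/some-table\")  # illustrative path\nmetadata = dt.metadata()\nprint(metadata.id)\nprint(metadata.partition_columns)\nprint(metadata.configuration)\n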

    ","boost":2},{"location":"how-delta-lake-works/architecture-of-delta-table/","title":"Architecture of a Delta Lake table","text":"

    A Delta table consists of Parquet files that contain data and a transaction log that stores metadata about the transactions.

    Let's create a Delta table, perform some operations, and inspect the files that are created.

    "},{"location":"how-delta-lake-works/architecture-of-delta-table/#delta-lake-transaction-examples","title":"Delta Lake transaction examples","text":"

    Start by creating a pandas DataFrame and writing it out to a Delta table.

    import pandas as pd\nfrom deltalake import DeltaTable, write_deltalake\n\ndf = pd.DataFrame({\"num\": [1, 2, 3], \"letter\": [\"a\", \"b\", \"c\"]})\nwrite_deltalake(\"tmp/some-table\", df)\n

    Now inspect the files created in storage:

    tmp/some-table\n\u251c\u2500\u2500 0-62dffa23-bbe1-4496-8fb5-bff6724dc677-0.parquet\n\u2514\u2500\u2500 _delta_log\n    \u2514\u2500\u2500 00000000000000000000.json\n

    The Parquet file stores the data that was written. The _delta_log directory stores metadata about the transactions. Let's inspect the _delta_log/00000000000000000000.json file.

    {\n  \"protocol\": {\n    \"minReaderVersion\": 1,\n    \"minWriterVersion\": 1\n  }\n}\n{\n  \"metaData\": {\n    \"id\": \"b96ea1a2-1830-4da2-8827-5334cc6104ed\",\n    \"name\": null,\n    \"description\": null,\n    \"format\": {\n      \"provider\": \"parquet\",\n      \"options\": {}\n    },\n    \"schemaString\": \"{\\\"type\\\":\\\"struct\\\",\\\"fields\\\":[{\\\"name\\\":\\\"num\\\",\\\"type\\\":\\\"long\\\",\\\"nullable\\\":true,\\\"metadata\\\":{}},{\\\"name\\\":\\\"letter\\\",\\\"type\\\":\\\"string\\\",\\\"nullable\\\":true,\\\"metadata\\\":{}}]}\",\n    \"partitionColumns\": [],\n    \"createdTime\": 1701740315599,\n    \"configuration\": {}\n  }\n}\n{\n  \"add\": {\n    \"path\": \"0-62dffa23-bbe1-4496-8fb5-bff6724dc677-0.parquet\",\n    \"size\": 2208,\n    \"partitionValues\": {},\n    \"modificationTime\": 1701740315597,\n    \"dataChange\": true,\n    \"stats\": \"{\\\"numRecords\\\": 3, \\\"minValues\\\": {\\\"num\\\": 1, \\\"letter\\\": \\\"a\\\"}, \\\"maxValues\\\": {\\\"num\\\": 3, \\\"letter\\\": \\\"c\\\"}, \\\"nullCount\\\": {\\\"num\\\": 0, \\\"letter\\\": 0}}\"\n  }\n}\n{\n  \"commitInfo\": {\n    \"timestamp\": 1701740315602,\n    \"operation\": \"CREATE TABLE\",\n    \"operationParameters\": {\n      \"location\": \"file:///Users/matthew.powers/Documents/code/delta/delta-examples/notebooks/python-deltalake/tmp/some-table\",\n      \"metadata\": \"{\\\"configuration\\\":{},\\\"created_time\\\":1701740315599,\\\"description\\\":null,\\\"format\\\":{\\\"options\\\":{},\\\"provider\\\":\\\"parquet\\\"},\\\"id\\\":\\\"b96ea1a2-1830-4da2-8827-5334cc6104ed\\\",\\\"name\\\":null,\\\"partition_columns\\\":[],\\\"schema\\\":{\\\"fields\\\":[{\\\"metadata\\\":{},\\\"name\\\":\\\"num\\\",\\\"nullable\\\":true,\\\"type\\\":\\\"long\\\"},{\\\"metadata\\\":{},\\\"name\\\":\\\"letter\\\",\\\"nullable\\\":true,\\\"type\\\":\\\"string\\\"}],\\\"type\\\":\\\"struct\\\"}}\",\n      \"protocol\": \"{\\\"minReaderVersion\\\":1,\\\"minWriterVersion\\\":1}\",\n      \"mode\": \"ErrorIfExists\"\n    },\n    \"clientVersion\": \"delta-rs.0.17.0\"\n  }\n}\n

    The transaction log file contains the following information:

    • the files added to the Delta table
    • schema of the files
    • column level metadata including the min/max value for each file

    Create another pandas DataFrame and append it to the Delta table to see how this transaction is recorded.

    df = pd.DataFrame({\"num\": [8, 9], \"letter\": [\"dd\", \"ee\"]})\nwrite_deltalake(f\"{cwd}/tmp/delta-table\", df, mode=\"append\")\n

    Here are the files in storage:

    tmp/some-table\n\u251c\u2500\u2500 0-62dffa23-bbe1-4496-8fb5-bff6724dc677-0.parquet\n\u251c\u2500\u2500 1-57abb6fb-2249-43ba-a7be-cf09bcc230de-0.parquet\n\u2514\u2500\u2500 _delta_log\n    \u251c\u2500\u2500 00000000000000000000.json\n    \u2514\u2500\u2500 00000000000000000001.json\n

    Here are the contents of the _delta_log/00000000000000000001.json file:

    {\n  \"add\": {\n    \"path\": \"1-57abb6fb-2249-43ba-a7be-cf09bcc230de-0.parquet\",\n    \"size\": 2204,\n    \"partitionValues\": {},\n    \"modificationTime\": 1701740386169,\n    \"dataChange\": true,\n    \"stats\": \"{\\\"numRecords\\\": 2, \\\"minValues\\\": {\\\"num\\\": 8, \\\"letter\\\": \\\"dd\\\"}, \\\"maxValues\\\": {\\\"num\\\": 9, \\\"letter\\\": \\\"ee\\\"}, \\\"nullCount\\\": {\\\"num\\\": 0, \\\"letter\\\": 0}}\"\n  }\n}\n{\n  \"commitInfo\": {\n    \"timestamp\": 1701740386169,\n    \"operation\": \"WRITE\",\n    \"operationParameters\": {\n      \"partitionBy\": \"[]\",\n      \"mode\": \"Append\"\n    },\n    \"clientVersion\": \"delta-rs.0.17.0\"\n  }\n}\n

    The transaction log records that the second file has been persisted in the Delta table.

    Now create a third pandas DataFrame and overwrite the Delta table with the new data.

    df = pd.DataFrame({\"num\": [11, 22], \"letter\": [\"aa\", \"bb\"]})\nwrite_deltalake(f\"{cwd}/tmp/delta-table\", df, mode=\"append\")\n

    Here are the files in storage:

    tmp/some-table\n\u251c\u2500\u2500 0-62dffa23-bbe1-4496-8fb5-bff6724dc677-0.parquet\n\u251c\u2500\u2500 1-57abb6fb-2249-43ba-a7be-cf09bcc230de-0.parquet\n\u251c\u2500\u2500 2-95ef2108-480c-4b89-96f0-ff9185dab9ad-0.parquet\n\u2514\u2500\u2500 _delta_log\n    \u251c\u2500\u2500 00000000000000000000.json\n    \u251c\u2500\u2500 00000000000000000001.json\n    \u2514\u2500\u2500 00000000000000000002.json\n

Here are the contents of the _delta_log/00000000000000000002.json file:

    {\n  \"add\": {\n    \"path\": \"2-95ef2108-480c-4b89-96f0-ff9185dab9ad-0.parquet\",\n    \"size\": 2204,\n    \"partitionValues\": {},\n    \"modificationTime\": 1701740465102,\n    \"dataChange\": true,\n    \"stats\": \"{\\\"numRecords\\\": 2, \\\"minValues\\\": {\\\"num\\\": 11, \\\"letter\\\": \\\"aa\\\"}, \\\"maxValues\\\": {\\\"num\\\": 22, \\\"letter\\\": \\\"bb\\\"}, \\\"nullCount\\\": {\\\"num\\\": 0, \\\"letter\\\": 0}}\"\n  }\n}\n{\n  \"remove\": {\n    \"path\": \"0-62dffa23-bbe1-4496-8fb5-bff6724dc677-0.parquet\",\n    \"deletionTimestamp\": 1701740465102,\n    \"dataChange\": true,\n    \"extendedFileMetadata\": false,\n    \"partitionValues\": {},\n    \"size\": 2208\n  }\n}\n{\n  \"remove\": {\n    \"path\": \"1-57abb6fb-2249-43ba-a7be-cf09bcc230de-0.parquet\",\n    \"deletionTimestamp\": 1701740465102,\n    \"dataChange\": true,\n    \"extendedFileMetadata\": false,\n    \"partitionValues\": {},\n    \"size\": 2204\n  }\n}\n{\n  \"commitInfo\": {\n    \"timestamp\": 1701740465102,\n    \"operation\": \"WRITE\",\n    \"operationParameters\": {\n      \"mode\": \"Overwrite\",\n      \"partitionBy\": \"[]\"\n    },\n    \"clientVersion\": \"delta-rs.0.17.0\"\n  }\n}\n

This transaction adds a data file and marks the two existing data files for removal. Marking a file for removal in the transaction log is known as \"tombstoning the file\" or a \"logical delete\". This is different from a \"physical delete\", which actually removes the data file from storage.

    "},{"location":"how-delta-lake-works/architecture-of-delta-table/#how-delta-table-operations-differ-from-data-lakes","title":"How Delta table operations differ from data lakes","text":"

Data lakes consist of data files persisted in storage. They don't have a transaction log that retains metadata about the transactions.

    Data lakes perform transactions differently than Delta tables.

When you perform an overwrite transaction with a Delta table, you logically delete the existing data without physically removing it.

    Data lakes don't support logical deletes, so you have to physically delete the data from storage.

    Logical data operations are safer because they can be rolled back if they don't complete successfully. Physically removing data from storage can be dangerous, especially if it's before a transaction is complete.

    We're now ready to look into Delta Lake ACID transactions in more detail.

    "},{"location":"how-delta-lake-works/delta-lake-acid-transactions/","title":"Delta Lake Transactions","text":"

This page teaches you about Delta Lake transactions and why transactions are important in production data settings. Data lakes don't support transactions, which is a huge downside: they offer a poor user experience, lack functionality, and can easily be corrupted.

Transactions on Delta Lake tables are operations that change the state of the table and record descriptive entries (metadata) about those changes in the Delta Lake transaction log. Here are some examples of transactions:

    • Deleting rows
    • Appending to the table
    • Compacting small files
    • Upserting
    • Overwriting rows

    All Delta Lake write operations are transactions in Delta tables. Reads actually aren\u2019t technically transactions because they don\u2019t result in new entries being appended to the transaction log.

    "},{"location":"how-delta-lake-works/delta-lake-acid-transactions/#what-are-transactions","title":"What are transactions?","text":"

A transaction is any Delta operation that changes the underlying files of a Delta table and results in new metadata entries in the transaction log. Some Delta operations rearrange data in the existing table (like Z Ordering the table or compacting small files); these are also transactions. Let's look at a simple example.

    Suppose you have a Delta table with the following data:

    num animal\n1   cat\n2   dog\n3   snake\n

    Here\u2019s how to create this Delta table:

    import pandas as pd\nfrom deltalake import write_deltalake, DeltaTable\n\ndf = pd.DataFrame({\"num\": [1, 2, 3], \"animal\": [\"cat\", \"dog\", \"snake\"]})\nwrite_deltalake(\"tmp/my-delta-table\", df)\n

    Here are the files created in storage.

    tmp/my-delta-table\n\u251c\u2500\u2500 0-fea2de92-861a-423e-9708-a9e91dafb27b-0.parquet\n\u2514\u2500\u2500 _delta_log\n    \u2514\u2500\u2500 00000000000000000000.json\n

    Let\u2019s perform an operation to delete every animal from the Delta table that is a cat.

    dt = DeltaTable(\"tmp/my-delta-table\")\ndt.delete(\"animal = 'cat'\")\n

    Let\u2019s take a look at the contents of the Delta table now that the transaction is complete:

    tmp/my-delta-table\n\u251c\u2500\u2500 0-fea2de92-861a-423e-9708-a9e91dafb27b-0.parquet\n\u251c\u2500\u2500 _delta_log\n\u2502   \u251c\u2500\u2500 00000000000000000000.json\n\u2502   \u2514\u2500\u2500 00000000000000000001.json\n\u2514\u2500\u2500 part-00001-90312b96-b487-4a8f-9edc-1b9b3963f136-c000.snappy.parquet\n

    Notice the 00000000000000000001.json file that was added to the transaction log to record this transaction. Let\u2019s inspect the content of the file.

    {\n  \"add\": {\n    \"path\": \"part-00001-90312b96-b487-4a8f-9edc-1b9b3963f136-c000.snappy.parquet\",\n    \"partitionValues\": {},\n    \"size\": 858,\n    \"modificationTime\": 1705070631953,\n    \"dataChange\": true,\n    \"stats\": \"{\\\"numRecords\\\":2,\\\"minValues\\\":{\\\"num\\\":2,\\\"animal\\\":\\\"dog\\\"},\\\"maxValues\\\":{\\\"num\\\":3,\\\"animal\\\":\\\"snake\\\"},\\\"nullCount\\\":{\\\"num\\\":0,\\\"animal\\\":0}}\",\n    \"tags\": null,\n    \"deletionVector\": null,\n    \"baseRowId\": null,\n    \"defaultRowCommitVersion\": null,\n    \"clusteringProvider\": null\n  }\n}\n{\n  \"remove\": {\n    \"path\": \"0-fea2de92-861a-423e-9708-a9e91dafb27b-0.parquet\",\n    \"dataChange\": true,\n    \"deletionTimestamp\": 1705070631953,\n    \"extendedFileMetadata\": true,\n    \"partitionValues\": {},\n    \"size\": 895\n  }\n}\n{\n  \"commitInfo\": {\n    \"timestamp\": 1705070631953,\n    \"operation\": \"DELETE\",\n    \"operationParameters\": {\n      \"predicate\": \"animal = 'cat'\"\n    },\n    \"readVersion\": 0,\n    \"operationMetrics\": {\n      \"execution_time_ms\": 8013,\n      \"num_added_files\": 1,\n      \"num_copied_rows\": 2,\n      \"num_deleted_rows\": 1,\n      \"num_removed_files\": 1,\n      \"rewrite_time_ms\": 2,\n      \"scan_time_ms\": 5601\n    },\n    \"clientVersion\": \"delta-rs.0.17.0\"\n  }\n}\n

    We can see that this transaction includes two components:

    • Remove file 0-fea2de92-861a-423e-9708-a9e91dafb27b-0.parquet
    • Add file part-00001-90312b96-b487-4a8f-9edc-1b9b3963f136-c000.snappy.parquet

    Transactions are recorded in the transaction log. The transaction log is also referred to as the table metadata and is the _delta_log directory in storage.

    Let\u2019s see how Delta Lake implements transactions.

    "},{"location":"how-delta-lake-works/delta-lake-acid-transactions/#how-delta-lake-implements-transactions","title":"How Delta Lake implements transactions","text":"

    Here is how Delta Lake implements transactions:

    1. Read the existing metadata
    2. Read the existing Parquet data files
    3. Write the Parquet files for the current transaction
    4. Record the new transaction in the transaction log (if there are no conflicts)

    Let\u2019s recall our delete operation from the prior section and see how it fits into this transaction model:

    1. We read the existing metadata to find the file paths for the existing Parquet files
    2. We read the existing Parquet files and identify the files that contains data that should be removed
    3. We write new Parquet files with the deleted data filtered out
    4. Once the new Parquet files are written, we check for conflicts and then make an entry in the transaction log. The next section will discuss transaction conflicts in more detail.

    Blind append operations can skip a few steps and are executed as follows:

    1. Write the Parquet files for the current transaction
    2. Record the new transaction in the metadata

Delta implements non-locking MVCC (multi-version concurrency control): writers optimistically write new data and simply abandon the transaction if a conflict is detected at the end. The alternative would be acquiring a lock at the start, which guarantees the transaction up front but blocks other writers in the meantime.
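The following is an illustrative, simplified sketch of the optimistic commit idea, not the delta-rs implementation. It uses the local filesystem's create-if-not-exists behavior (open with mode \"x\") to stand in for the atomic primitive discussed below; the try_commit helper and the tmp/sketch path are hypothetical.

import json\nimport os\n\ndef try_commit(log_dir, version, actions):\n    # Atomic create-if-not-exists: mode \"x\" raises FileExistsError\n    # if another writer has already claimed this version number.\n    path = os.path.join(log_dir, f\"{version:020}.json\")\n    try:\n        with open(path, \"x\") as f:\n            for action in actions:\n                print(json.dumps(action), file=f)\n        return True\n    except FileExistsError:\n        return False  # conflict: another transaction committed this version first\n\nos.makedirs(\"tmp/sketch/_delta_log\", exist_ok=True)\nok = try_commit(\"tmp/sketch/_delta_log\", 0, [{\"commitInfo\": {\"operation\": \"WRITE\"}}])\nprint(\"committed\" if ok else \"conflict: re-check the log, then retry or abort\")\n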

    Let\u2019s look at the case when a Delta Lake transaction conflicts.

    "},{"location":"how-delta-lake-works/delta-lake-acid-transactions/#how-delta-lake-transactions-can-conflict","title":"How Delta Lake transactions can conflict","text":"

    Suppose you have a transaction that deletes a row of data that\u2019s stored in FileA (Transaction 1). While this job is running, there is another transaction that deletes some other rows in FileA (Transaction 2). Transaction 1 finishes running first and is recorded in the metadata.

Before Transaction 2 is recorded, it will check the metadata, find that it conflicts with a transaction that was already recorded (Transaction 1), and error out without recording a new transaction.

Transaction 2 will have written Parquet data files, but because it is not recorded as a transaction, those data files will be ignored. The zombie Parquet files can be easily cleaned up via subsequent vacuum operations.

Transaction 2 must fail; otherwise it would cause the data to be incorrect.

    Delta Lake transactions prevent users from making changes that would corrupt the table. Transaction conflict behavior can differ based on isolation level, which controls the degree to which a transaction must be isolated from modifications made by other concurrent transactions. More about this in the concurrency section.

    "},{"location":"how-delta-lake-works/delta-lake-acid-transactions/#transactions-rely-on-atomic-primitives-storage-guarantees","title":"Transactions rely on atomic primitives storage guarantees","text":"

Suppose you have two transactions that are finishing at exactly the same time. Both of these transactions look at the existing Delta Lake transaction log, see that the latest transaction was 003.json, and determine that the next entry should be 004.json.

    If both transactions are recorded in the 004.json file, then one of them will be clobbered, and the transaction log entry for the clobbered metadata entry will be lost.

Delta tables rely on storage systems that provide atomic primitives for safe concurrency. The storage system must allow Delta Lake to write the file only if it does not already exist, and error out otherwise. The storage system must NOT permit concurrent writers to overwrite existing metadata entries.

    Some clouds have filesystems that don\u2019t explicitly support these atomic primitives, and therefore must be coupled with other services to provide the necessary guarantees.

    "},{"location":"how-delta-lake-works/delta-lake-acid-transactions/#delta-lake-transactions-are-only-for-a-single-table","title":"Delta Lake transactions are only for a single table","text":"

    Delta Lake transactions are only valid for a single table.

    Some databases offer transaction support for operations that impact multiple tables. Delta Lake does not support multi-table transactions.

    "},{"location":"how-delta-lake-works/delta-lake-acid-transactions/#data-lakes-dont-support-transactions","title":"Data lakes don\u2019t support transactions","text":"

    Data lakes consist of many files in a storage system (e.g. a cloud storage system) and don\u2019t support transactions.

    Data lakes don\u2019t have a metadata layer, conflict resolution, or any way to store information about transactions.

    Data lakes are prone to multiple types of errors because they don\u2019t support transactions:

    • Easy to corrupt
    • Downtime/unstable state while jobs are running
    • Operations can conflict

Data lakes have many downsides; it's almost always better to use a lakehouse storage system like Delta Lake than a data lake.

    "},{"location":"how-delta-lake-works/delta-lake-acid-transactions/#acid-transactions","title":"ACID Transactions","text":"

    We\u2019ve already explored how Delta Lake supports transactions. This section explains how Delta Lake transactions have the Atomic, Consistent, Isolated and Durable (ACID transaction) properties. Reading this section is optional.

    ACID transactions are commonplace in databases but notably absent for data lakes.

    Delta Lake\u2019s ACID transaction support is one of the major reasons it is almost always a better option than a data lake.

    Let\u2019s explore how Delta Lake allows for ACID transactions.

    Atomic transactions

    An atomic transaction either fully completes or fully fails, with nothing in between.

    Delta Lake transactions are atomic, unlike data lake transactions that are not atomic.

    Suppose you have a job that\u2019s writing 100 files to a table. Further suppose that the job errors out and the cluster dies after writing 40 files:

    • For a Delta table, no additional data will be added to the table. Parquet files were written to the table, but the job errored, so no transaction log entry was added and no data was added to the table.
    • For a data lake, the 40 files are added and the transaction \u201cpartially succeeds\u201d.

    For data tables, it\u2019s almost always preferable to have a transaction that \u201cfully fails\u201d instead of one that \u201cpartially succeeds\u201d because partial writes are hard to unwind and debug.

    Delta Lake implements atomic transactions by writing data files first before making a new entry in the Delta transaction log.

    These guarantees are provided at the protocol level through the \"transaction\" abstraction. We\u2019ve already discussed what constitutes a transaction for Delta Lake.

    If there is an error with the transaction and some files don\u2019t get written, then no metadata entry is made and the partial data write is ignored. The zombie Parquet files can be easily cleaned up via subsequent vacuum operations.

    Now let\u2019s look at how Delta Lake also provides consistent transactions.

    Consistent transactions

    Consistency means that transactions won\u2019t violate integrity constraints on the Delta table.

    Delta Lake has two types of consistency checks:

    • Schema enforcement checks
    • Column constraints

    Schema enforcement checks verify that new data appended to a Delta table matches the schema of the existing table. You cannot append data with a different schema, unless you enable schema evolution.

    Delta Lake column constraints allow users to specify the requirements of data that\u2019s added to a Delta table. For example, if you have an age column with a constraint that requires the value to be positive, then Delta Lake will reject appends of any data that doesn\u2019t meet the constraint.
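Here is a minimal sketch of adding such a constraint with the Python deltalake package; the tmp/people path and column names are hypothetical, and enforcing the constraint on subsequent writes assumes the rust writer engine.

import pandas as pd\nfrom deltalake import DeltaTable, write_deltalake\n\ndf = pd.DataFrame({\"name\": [\"ann\", \"bob\"], \"age\": [23, 41]})\nwrite_deltalake(\"tmp/people\", df)\n\ndt = DeltaTable(\"tmp/people\")\ndt.alter.add_constraint({\"age_is_positive\": \"age > 0\"})\n\n# An append that violates the constraint is rejected, so the table\n# never ends up in an inconsistent state:\n# write_deltalake(\"tmp/people\", pd.DataFrame({\"name\": [\"cat\"], \"age\": [-5]}), mode=\"append\", engine=\"rust\")\n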

    Data lakes don\u2019t support schema enforcement or column constraints. That\u2019s another reason why data lakes are not ACID-compliant.

    Isolated transactions

    Isolation means that transactions are applied to a Delta table sequentially.

    Delta Lake transactions are persisted in monotonically increasing transaction files, as we saw in the previous example. First 00000000000000000000.json, then 00000000000000000001.json, then 00000000000000000002.json, and so on.
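You can see this sequence for yourself by inspecting the table's commit history, for example with a sketch like the following (the exact keys in each history entry can vary by version):

from deltalake import DeltaTable\n\ndt = DeltaTable(\"tmp/my-delta-table\")\nprint(dt.version())  # the latest transaction version, e.g. 1 after the delete above\nfor commit in dt.history():\n    print(commit.get(\"version\"), commit.get(\"operation\"))\n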

    Delta Lake uses concurrency control to ensure that transactions are executed sequentially, even when user operations are performed concurrently. The next page of this guide explains concurrency in Delta Lake in detail.

    Durable transactions

    Delta tables are generally persisted in cloud object stores which provide durability guarantees.

    Durability means that all transactions that are successfully completed will always remain persisted, even if there are service outages or program crashes.

    Suppose you have a Delta table that\u2019s persisted in Azure blob storage. The Delta table transactions that are committed will always remain available, even in these circumstances:

    • When there are Azure service outages
    • If a computation cluster that\u2019s writing the Delta table crashes for some reason
    • Two operations are running concurrently and one of them fails

    Successful transactions are always registered in the Delta table and persisted no matter what.

    "},{"location":"how-delta-lake-works/delta-lake-acid-transactions/#conclusion","title":"Conclusion","text":"

    Delta Lake supports transactions which provide necessary reliability guarantees for production data systems.

    Vanilla data lakes don\u2019t provide transactions and this can cause nasty bugs and a bad user experience. Let\u2019s look at a couple of scenarios when the lack of transactions cause a poor user experience:

    • While running a compaction operation on a data lake, newly compacted \u201cright sized\u201d files are added before the small files are deleted. If you read the data lake while this operation is running, you will see duplicate data.
    • While writing to a data lake, a job might fail, which leaves behind partially written files. These files are corrupt, which means that the data lake cannot be read until the corrupt files are manually removed.
    • Users want to run a simple DML operation like deleting a few rows of data which require a few files to be rewritten. This operation renders the data lake unusable until it\u2019s done running.

    Transactions are a key advantage of Delta Lake vs. data lakes. There are many other advantages, but proper transactions are necessary in production data environments.

    "},{"location":"how-delta-lake-works/delta-lake-file-skipping/","title":"Delta Lake File Skipping","text":"

    Delta tables store file-level metadata information, which allows for a powerful optimization called file skipping.

    This page explains how Delta Lake implements file skipping, how to optimize your tables to maximize file skipping, and the benefits of file skipping.

    Let\u2019s start by looking at the file-level metadata in Delta tables.

    "},{"location":"how-delta-lake-works/delta-lake-file-skipping/#delta-lake-file-metadata","title":"Delta Lake file metadata","text":"

    Delta Lake stores metadata about each file's min/max values in the table. Query engines can skip entire files when they don\u2019t contain data that\u2019s relevant to the query.

Suppose you have a Delta table with data stored in two files and the following metadata.

    filename    min_name    max_name    min_age max_age\nfileA       alice       joy         12      46  \nfileB       allan       linda       34      78\n

Suppose you want to run the following query: select * from the_table where age < 20.

    The engine only needs to read fileA to execute this query. fileB has a min_age of 34, so we know there aren\u2019t any rows of data with an age less than 20.

    The benefit of file skipping depends on the query and the data layout of the Delta table. Some queries cannot take advantage of any file skipping. Here\u2019s an example query that does not benefit from file skipping: select * from the_table group by age.

    Let\u2019s recreate this example with Polars to drive the point home.

    Start by writing out one file of data:

    import polars as pl\nfrom deltalake import DeltaTable\n\ndf = pl.DataFrame({\"name\": [\"alice\", \"cat\", \"joy\"], \"age\": [12, 35, 46]})\ndf.write_delta(\"tmp/a_table\")\n

    Now, write out another file of data:

    df = pl.DataFrame({\"name\": [\"allan\", \"brian\", \"linda\"], \"age\": [34, 35, 78]})\ndf.write_delta(\"tmp/a_table\", mode=\"append\")\n

    Here are the contents of the Delta table:

    tmp/a_table\n\u251c\u2500\u2500 0-7d414a88-a634-4c2f-9c5b-c29b6ee5f524-0.parquet\n\u251c\u2500\u2500 1-0617ef60-b17b-46a5-9b0f-c7dda1b73eee-0.parquet\n\u2514\u2500\u2500 _delta_log\n    \u251c\u2500\u2500 00000000000000000000.json\n    \u2514\u2500\u2500 00000000000000000001.json\n

    Now run a query to fetch all the records where the age is less than 20:

    pl.scan_delta(\"tmp/a_table\").filter(pl.col(\"age\") < 20).collect()\n
    +-------+-----+\n| name  | age |\n| ---   | --- |\n| str   | i64 |\n+=============+\n| alice | 12  |\n+-------+-----+\n

    Polars can use the Delta table metadata to skip the file that does not contain data relevant to the query.
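If you want to look at this file-level metadata directly, one way (sketched here) is to flatten the add actions from the transaction log into a DataFrame; the exact statistics columns you see can vary:

from deltalake import DeltaTable\n\ndt = DeltaTable(\"tmp/a_table\")\n# One row per data file, including the min/max column statistics used for skipping.\nprint(dt.get_add_actions(flatten=True).to_pandas())\n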

    "},{"location":"how-delta-lake-works/delta-lake-file-skipping/#how-delta-lake-implements-file-skipping","title":"How Delta Lake implements file skipping","text":"

    Here\u2019s how engines execute queries on Delta tables:

    • Start by reading the transaction log to get the file paths, file sizes, and min/max value for each column
    • Parse the query and push down the predicates to skip files
    • Read the minimal subset of the files needed for the query

    Some file formats don\u2019t allow for file skipping. For example, CSV files don\u2019t have file-level metadata, so query engines can\u2019t read a minimal subset of the data. The query engine has to check all the files, even if they don\u2019t contain any relevant data.

    When data is in Parquet files, the query engine can open up all the files, read the footers, build the file-level metadata, and perform file skipping. Fetching metadata in each file is slower than grabbing the pre-built file-level metadata from the transaction log.

    Now, let\u2019s see how to structure your tables to allow for more file skipping.

    "},{"location":"how-delta-lake-works/delta-lake-file-skipping/#file-skipping-for-different-file-sizes","title":"File skipping for different file sizes","text":"

    Delta tables store data in files. Smaller files allow for more file skipping compared to bigger files.

    However, an excessive number of small files isn\u2019t good because it creates I/O overhead and slows down queries.

    Your Delta tables should have files that are \u201cright-sized\u201d. For a table with 150 GB of data, 5 GB files would probably be too large, and 10 KB files would be too small. It\u2019s generally best to store data in files that are between 100 MB and 1 GB.

    Delta Lake has an optimize function that performs small file compaction, so you don\u2019t need to program this logic yourself.

    Now, let's investigate how to store data in files to maximize the file skipping opportunities.

    "},{"location":"how-delta-lake-works/delta-lake-file-skipping/#how-to-maximize-file-skipping","title":"How to maximize file skipping","text":"

    You can maximize file-skipping by colocating similar data in the same files.

    Suppose you have a table with test scores and frequently run queries that filter based on the test_score column.

    filename    min_test_score  max_test_score\nfileA       45              100\nfileB       65              98\nfileC       50              96\n

    Suppose you want to run the following query: select * from exams where test_score > 90.

    This query cannot skip files, given the current organization of the data. You can rearrange the data to colocate similar test scores in the same files to allow for file skipping. Here\u2019s the new layout:

    filename    min_test_score  max_test_score\nfileD       45              70\nfileE       55              80\nfileF       78              100\n

    The query (select * from exams where test_score > 90) can skip two of the three files with the new Delta table layout. The query engine only has to read fileF for this query.
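Z Ordering (covered in the conclusion below) is one way to get this kind of layout without rearranging files by hand. A minimal sketch, assuming a Delta table at the hypothetical path tmp/exams with a test_score column:

from deltalake import DeltaTable\n\ndt = DeltaTable(\"tmp/exams\")  # hypothetical table\ndt.optimize.z_order([\"test_score\"])  # colocate similar test scores in the same files\n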

    Now, let\u2019s look at how file skipping works with string values.

    "},{"location":"how-delta-lake-works/delta-lake-file-skipping/#how-file-skipping-works-with-strings","title":"How file skipping works with strings","text":"

    File skipping is also effective when filtering on string values.

    Suppose you have a table with person_name and country columns. There are millions of rows of data. Here are the first three rows of data:

    person_name country\nperson1     angola\nperson2     china\nperson3     mexico\n

    The Delta table contains three files with the following metadata:

    filename    min_country max_country\nfileA       albania     mali\nfileB       libia       paraguay\nfileC       oman        zimbabwe\n

    Suppose you want to run the following query: select * from some_people where country = 'austria'.

You only need to read the data in fileA to run this query. The min_country values for fileB and fileC are greater than \u201caustria\u201d, so we know those files don't contain any data relevant to the query.

    File skipping can also be a robust optimization for string values. Now, let\u2019s see how file skipping works for partitioned tables.

    "},{"location":"how-delta-lake-works/delta-lake-file-skipping/#file-skipping-for-partitioned-tables","title":"File skipping for partitioned tables","text":"

    You can partition Delta tables for file skipping as well. Suppose we have the same data as in the previous section, but the table is partitioned by country.

    Here\u2019s the Delta table:

    filename    partition\nfileA       albania\nfileB       libia\nfileC       oman\nfileD       jamaica\nfileE       albania\nfileF       oman\n

    Suppose you want to run the following query on this partitioned table: select * from some_partitioned_table where country = 'albania'.

    You only need to read fileA and fileE to execute this query. Delta Lake provides the file-level partition metadata in the transaction log so that this query will run quickly.
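Here is a minimal sketch of creating such a partitioned table and reading a single partition with the Python deltalake package (the tmp/some_people path is hypothetical):

import pandas as pd\nfrom deltalake import DeltaTable, write_deltalake\n\ndf = pd.DataFrame(\n    {\"person_name\": [\"person1\", \"person2\", \"person3\"], \"country\": [\"angola\", \"china\", \"mexico\"]}\n)\nwrite_deltalake(\"tmp/some_people\", df, partition_by=[\"country\"])\n\ndt = DeltaTable(\"tmp/some_people\")\n# Only the files under the country=china partition are read for this query.\nprint(dt.to_pandas(partitions=[(\"country\", \"=\", \"china\")]))\n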

    "},{"location":"how-delta-lake-works/delta-lake-file-skipping/#conclusion","title":"Conclusion","text":"

    Delta Lake allows for file skipping, which is a powerful performance optimization.

    Delta Lake also provides built-in utilities to colocate data in the same files like partitioning, Z Ordering, and compaction to improve file skipping.

Delta Lake users need to know how to assess the tradeoffs of these techniques to optimize file skipping. Users also need to understand the most frequent query patterns of their tables to best allow for maximal file skipping.

    "},{"location":"integrations/delta-lake-arrow/","title":"Delta Lake Arrow Integrations","text":"

    Delta Lake tables can be exposed as Arrow tables and Arrow datasets, which allows for interoperability with a variety of query engines.

This page shows you how to convert Delta tables to Arrow data structures and teaches you the difference between Arrow tables and Arrow datasets. Tables are \"eager\" and datasets are \"lazy\", which has important performance implications. Keep reading to learn more!

    "},{"location":"integrations/delta-lake-arrow/#delta-lake-to-arrow-dataset","title":"Delta Lake to Arrow Dataset","text":"

    Delta tables can easily be exposed as Arrow datasets. This makes it easy for any query engine that can read Arrow datasets to read a Delta table.

    Let's take a look at the h2o groupby dataset that contains 9 columns of data. Here are three representative rows of data:

    +-------+-------+--------------+-------+-------+--------+------+------+---------+\n| id1   | id2   | id3          |   id4 |   id5 |    id6 |   v1 |   v2 |      v3 |\n|-------+-------+--------------+-------+-------+--------+------+------+---------|\n| id016 | id046 | id0000109363 |    88 |    13 | 146094 |    4 |    6 | 18.8377 |\n| id039 | id087 | id0000466766 |    14 |    30 | 111330 |    4 |   14 | 46.7973 |\n| id047 | id098 | id0000307804 |    85 |    23 | 187639 |    3 |    5 | 47.5773 |\n+-------+-------+--------------+-------+-------+--------+------+------+---------+\n

    Here's how to expose the Delta table as a PyArrow dataset and run a query with DuckDB:

    import duckdb\nfrom deltalake import DeltaTable\n\ntable = DeltaTable(\"delta/G1_1e9_1e2_0_0\")\ndataset = table.to_pyarrow_dataset()\nquack = duckdb.arrow(dataset)\nquack.filter(\"id1 = 'id016' and v2 > 10\")\n

    Here's the result:

    \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n\u2502   id1   \u2502   id2   \u2502     id3      \u2502  id4  \u2502  id5  \u2502   id6   \u2502  v1   \u2502  v2   \u2502    v3     \u2502\n\u2502 varchar \u2502 varchar \u2502   varchar    \u2502 int32 \u2502 int32 \u2502  int32  \u2502 int32 \u2502 int32 \u2502  double   \u2502\n\u251c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2524\n\u2502 id016   \u2502 id054   \u2502 id0002309114 \u2502    62 \u2502    95 \u2502 7180859 \u2502     4 \u2502    13 \u2502  7.750173 \u2502\n\u2502 id016   \u2502 id044   \u2502 id0003968533 \u2502    63 \u2502    98 \u2502 2356363 \u2502     4 \u2502    14 \u2502  3.942417 \u2502\n\u2502 id016   \u2502 id034   \u2502 id0001082839 \u2502    58 \u2502    73 \u2502 8039808 \u2502     5 \u2502    12 \u2502 76.820135 \u2502\n\u251c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2524\n\u2502 ? rows (>9999 rows, 3 shown)                                                 9 columns \u2502\n\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n

    Arrow datasets allow for the predicates to get pushed down to the query engine, so the query is executed quickly.

    "},{"location":"integrations/delta-lake-arrow/#delta-lake-to-arrow-table","title":"Delta Lake to Arrow Table","text":"

    You can also run the same query with DuckDB on an Arrow table:

    quack = duckdb.arrow(table.to_pyarrow_table())\nquack.filter(\"id1 = 'id016' and v2 > 10\")\n

    This returns the same result, but it runs slower.

    "},{"location":"integrations/delta-lake-arrow/#difference-between-arrow-dataset-and-arrow-table","title":"Difference between Arrow Dataset and Arrow Table","text":"

Arrow datasets are lazy and allow for full predicate pushdown, unlike Arrow tables, which are eagerly loaded into memory.

    The previous DuckDB queries were run on a 1 billion row dataset that's roughly 50 GB when stored as an uncompressed CSV file. Here are the runtimes when the data is stored in a Delta table and the queries are executed on a 2021 Macbook M1 with 64 GB of RAM:

    • Arrow table: 17.1 seconds
    • Arrow dataset: 0.01 seconds

    The query runs much faster on an Arrow dataset because the predicates can be pushed down to the query engine and lots of data can be skipped.

    Arrow tables are eagerly materialized in memory and don't allow for the same amount of data skipping.

    "},{"location":"integrations/delta-lake-arrow/#multiple-query-engines-can-query-arrow-datasets","title":"Multiple query engines can query Arrow Datasets","text":"

Other query engines like DataFusion can also query Arrow datasets; see the following example:

    from datafusion import SessionContext\n\nctx = SessionContext()\nctx.register_dataset(\"my_dataset\", table.to_pyarrow_dataset())\nctx.sql(\"select * from my_dataset where v2 > 5\")\n

    Here's the result:

    +-------+-------+--------------+-----+-----+--------+----+----+-----------+\n| id1   | id2   | id3          | id4 | id5 | id6    | v1 | v2 | v3        |\n+-------+-------+--------------+-----+-----+--------+----+----+-----------+\n| id082 | id049 | id0000022715 | 97  | 55  | 756924 | 2  | 11 | 74.161136 |\n| id053 | id052 | id0000113549 | 19  | 56  | 139048 | 1  | 10 | 95.178444 |\n| id090 | id043 | id0000637409 | 94  | 50  | 12448  | 3  | 12 | 60.21896  |\n+-------+-------+--------------+-----+-----+--------+----+----+-----------+\n

    Any query engine that's capable of reading an Arrow table/dataset can read a Delta table.

    "},{"location":"integrations/delta-lake-arrow/#conclusion","title":"Conclusion","text":"

    Delta tables can easily be exposed as Arrow tables/datasets.

    Therefore any query engine that can read an Arrow table/dataset can also read a Delta table.

Arrow datasets allow for more predicates to be pushed down to the query engine, so they can deliver better performance than Arrow tables.

    "},{"location":"integrations/delta-lake-daft/","title":"Using Delta Lake with Daft","text":"

    Daft is a framework for ETL, analytics, and ML/AI at scale with a familiar Python dataframe API, implemented in Rust.

    Daft and Delta Lake work really well together. Daft provides unified compute for Delta Lake\u2019s unified storage. Together, Delta Lake and Daft give you high-performance query optimization and distributed compute on massive datasets.

    Delta Lake is a great storage format for Daft workloads. Delta gives Daft users:

    • Query optimization via file-skipping and column pruning
    • Versioning for easy time travel functionality
    • Faster reads via Z-ordering
    • ACID transactions and schema enforcement for more reliable reads and writes

    For Delta Lake users, Daft is a great data processing tool because it gives you the following features:

    • Multimodal Dataframes: read, write and transform multimodal data incl. images, JSON, PDF, audio, etc.
    • Parallel + Distributed Reads: Daft parallelizes Delta Lake table reads over all cores of your machine, if using the default multithreading runner, or all cores + machines of your Ray cluster, if using the distributed Ray runner.
    • Skipping Filtered Data: Daft implements automatic partition pruning and stats-based file pruning for filter predicates, skipping data that doesn\u2019t need to be read.

    Let's look at how to use Delta Lake with Daft.

    "},{"location":"integrations/delta-lake-daft/#installing-daft-for-delta-lake","title":"Installing Daft for Delta Lake","text":"

    The easiest way to use the Delta Lake table format with Daft DataFrames is to install Daft with the [deltalake] extras using pip:

    !pip install -U \"getdaft[deltalake]\"\n

This adds the deltalake Python package to your install. This package is used to fetch metadata about the Delta Lake table, such as paths to the underlying Parquet files and table statistics. You can of course also install the deltalake package manually.

    "},{"location":"integrations/delta-lake-daft/#read-delta-lake-into-a-daft-dataframe","title":"Read Delta Lake into a Daft DataFrame","text":"

You can easily read Delta Lake tables into a Daft DataFrame using the read_delta_lake method. Let's use it to read some data stored as a Delta Lake table on disk. You can access the data stored as a Delta Lake table on Github.

    import daft\n\n# read delta table into Daft DataFrame\ndf = daft.read_delta_lake(\"path/to/delta_table\")\n

    You can also read in Delta Lake data from remote sources like S3:

    # table_uri = (\n#     \"s3://daft-public-datasets/red-pajamas/\"\n#     \"stackexchange-sample-north-germanic-deltalake\"\n# )\n# df = daft.read_delta_lake(table_uri)\n
    df\n
first_name (Utf8) | last_name (Utf8) | country (Utf8) | continent (Utf8)  (No data to display: Dataframe not materialized)

    Daft DataFrames are lazy by default. This means that the contents will not be computed (\"materialized\") unless you explicitly tell Daft to do so. This is best practice for working with larger-than-memory datasets and parallel/distributed architectures.

    The Delta table we have just loaded only has 5 rows. You can materialize it in memory using the .collect method:

    > df.collect()\n\n|    | first_name   | last_name   | country   | continent   |\n|---:|:-------------|:------------|:----------|:------------|\n|  0 | Ernesto      | Guevara     | Argentina | NaN         |\n|  1 | Bruce        | Lee         | China     | Asia        |\n|  2 | Jack         | Ma          | China     | Asia        |\n|  3 | Wolfgang     | Manche      | Germany   | NaN         |\n|  4 | Soraya       | Jala        | Germany   | NaN         |\n
    "},{"location":"integrations/delta-lake-daft/#write-to-delta-lake","title":"Write to Delta Lake","text":"

    You can use write_deltalake to write a Daft DataFrame to a Delta table:

    df.write_deltalake(\"tmp/daft-table\", mode=\"overwrite\")\n

    Daft supports multiple write modes. See the Daft documentation for more information.

    "},{"location":"integrations/delta-lake-daft/#what-can-i-do-with-a-daft-dataframe","title":"What can I do with a Daft DataFrame?","text":"

    Daft gives you full-featured DataFrame functionality, similar to what you might be used to from pandas, Dask or PySpark.

    On top of this, Daft also gives you:

    • Multimodal data type support to work with Images, URLs, Tensors and more
    • Expressions API for easy column transformations
    • UDFs for multi-column transformation, incl. ML applications

    Check out the Daft User Guide for a complete list of DataFrame operations.
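As a small illustrative sketch, reusing the df loaded above, you can chain filters and projections before materializing:

# Filter, project and materialize, as with other DataFrame libraries.\ndf.where(df[\"continent\"] == \"Asia\").select(\"first_name\", \"last_name\").show()\n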

    "},{"location":"integrations/delta-lake-daft/#data-skipping-optimizations","title":"Data Skipping Optimizations","text":"

    Delta Lake and Daft work together to give you highly-optimized query performance.

Delta Lake stores your data in Parquet files. Parquet is a columnar file format that natively supports column pruning. If your query only needs to read data from a specific column or set of columns, you don't need to read in the entire file. This can save you lots of time and compute.

    Delta Lake goes beyond the basic Parquet features by also giving you:

    • partitioned reads
    • file skipping via z-ordering.

    This is great for Daft users who want to run efficient queries on large-scale data.

    Let's look at how this works.

    "},{"location":"integrations/delta-lake-daft/#partitioned-reads","title":"Partitioned Reads","text":"

    You may have noticed the Delta Lake warning at the top when we first called collect() on our DataFrame:

    WARNING: has partitioning keys = [PartitionField(country#Utf8)], but no partition filter was specified. This will result in a full table scan.

    Delta Lake is informing us that the data is partitioned on the country column.

    Daft does some nice magic here to help you out. The Daft query optimizer has access to all of the Delta Lake metadata. This means it can optimize your query by skipping the partitions that are not relevant for this query. Instead of having to read all 3 partitions, we can read only 1 and get the same result, just faster!

    # Filter on partition columns will result in efficient partition pruning; non-matching partitions will be skipped.\n> df.where(df[\"country\"] == \"Germany\").show()\n\n|    | first_name   | last_name   | country   |   continent |\n|---:|:-------------|:------------|:----------|------------:|\n|  0 | Wolfgang     | Manche      | Germany   |         nan |\n|  1 | Soraya       | Jala        | Germany   |         nan |\n

    You can use the explain() method to see how Daft is optimizing your query.

    Since we've already called collect on our DataFrame, it is already in memory. So below we copy the output of explain(show_all=True) before calling collect:

    Running df.where(df[\"continent\"] == \"Asia\").explain(True) returns:

    (...)\n\n== Optimized Logical Plan ==\n\n* PythonScanOperator: DeltaLakeScanOperator(None)\n|   File schema = first_name#Utf8, last_name#Utf8, country#Utf8, continent#Utf8\n|   Partitioning keys = [PartitionField(country#Utf8)]\n|   Filter pushdown = col(continent) == lit(\"Asia\")\n|   Output schema = first_name#Utf8, last_name#Utf8, country#Utf8, continent#Utf8\n\n\n== Physical Plan ==\n\n* TabularScan:\n|   Num Scan Tasks = 3\n|   Estimated Scan Bytes = 3045\n|   Clustering spec = { Num partitions = 3 }\n

    Whereas running df.where(df[\"country\"] == \"Germany\").explain(True) returns:

    (...)\n\n== Optimized Logical Plan ==\n\n* PythonScanOperator: DeltaLakeScanOperator(None)\n|   File schema = first_name#Utf8, last_name#Utf8, country#Utf8, continent#Utf8\n|   Partitioning keys = [PartitionField(country#Utf8)]\n|   Partition Filter = col(country) == lit(\"Germany\")\n|   Output schema = first_name#Utf8, last_name#Utf8, country#Utf8, continent#Utf8\n\n\n== Physical Plan ==\n\n* TabularScan:\n|   Num Scan Tasks = 1\n|   Estimated Scan Bytes = 1025\n|   Clustering spec = { Num partitions = 1 }\n

    Running a query on a non-partitioned column like continent will require reading in all partitions, totalling 3045 bytes in the case of this toy example.

Instead, running a query on a partitioned column (country in this case) means Daft only has to read the relevant partition, saving us ~60% of the compute. This has a huge impact when you're working at scale.

    "},{"location":"integrations/delta-lake-daft/#z-ordering-for-enhanced-file-skipping","title":"Z-Ordering for enhanced file skipping","text":"

    Z-ordering stores similar data close together to optimize query performance. This is especially useful when you're querying on one or multiple columns.

    Using Z-Ordered Delta tables instead of regular Parquet can give Daft users significant speed-ups.
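As a sketch, you could Z Order the Delta table with the Python deltalake package and then read it with Daft as before (the continent column is just an illustrative choice here):

import daft\nfrom deltalake import DeltaTable\n\ndt = DeltaTable(\"path/to/delta_table\")\ndt.optimize.z_order([\"continent\"])  # colocate rows with similar continent values\n\ndf = daft.read_delta_lake(\"path/to/delta_table\")\n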

    Read High-Performance Querying on Massive Delta Lake Tables with Daft for an in-depth benchmarking of query optimization with Delta Lake and Daft using partitioning and Z-ordering.

    "},{"location":"integrations/delta-lake-daft/#daft-gives-you-multimodal-data-type-support","title":"Daft gives you Multimodal Data Type Support","text":"

    Daft has a rich multimodal type-system with support for Python objects, Images, URLs, Tensors and more.

    The Expressions API provides useful tools to work with these data types. By combining multimodal data support with the User-Defined Functions API you can run ML workloads right within your DataFrame.

    Take a look at the notebook in the delta-examples Github repository for a closer look at how Daft handles URLs, images and ML applications.

    "},{"location":"integrations/delta-lake-daft/#contribute-to-daft","title":"Contribute to daft","text":"

    Excited about Daft and want to contribute? Join them on Github \ud83d\ude80

Like many technologies, Daft collects some non-identifiable telemetry to improve the product. This is strictly non-identifiable metadata. You can disable telemetry by setting the following environment variable: DAFT_ANALYTICS_ENABLED=0. Read more in the Daft documentation.

    "},{"location":"integrations/delta-lake-dagster/","title":"Using Delta Lake with Dagster\u00b6","text":"

    Delta Lake is a great storage format for Dagster workflows. This page will explain why and how to use Delta Lake with Dagster.

    You will learn how to use the Delta Lake I/O Manager to read and write your Dagster Software-Defined Assets (SDAs). You will also learn about the unique advantages Delta Lake offers the Dagster community.

Here are some of the benefits that Delta Lake provides Dagster users:

• native PyArrow integration for lazy computation of large datasets
• more efficient querying with file skipping via Z Ordering and liquid clustering
• built-in vacuuming to remove unnecessary files and versions
• ACID transactions for reliable writes
• smooth versioning integration so that versions can be used to trigger downstream updates
• surfacing table stats based on the file statistics

    "},{"location":"integrations/delta-lake-dagster/#dagster-io-managers","title":"Dagster I/O Managers","text":"

    Dagster uses I/O Managers to simplify data reads and writes. I/O Managers help you reduce boilerplate code by storing Dagster Asset and Op outputs and loading them as inputs to downstream objects. They make it easy to change where and how your data is stored.

    You only need to define your I/O Manager and its settings (such as storage location and schema) once and the I/O Manager will take care of correctly reading and writing all your Dagster Assets automatically.

    If you need lower-level access than the Dagster I/O Managers provide, take a look at the Delta Table Resource.

    "},{"location":"integrations/delta-lake-dagster/#the-delta-lake-io-manager","title":"The Delta Lake I/O Manager","text":"

    You can easily read and write Delta Lake Tables from Dagster by using the DeltaLakeIOManager().

    Install the DeltaLakeIOManager:

    pip install dagster-deltalake\n

Next, configure the following settings in your project\u2019s __init__.py file:

• io_manager: set this to DeltaLakeIOManager(); this sets the default I/O Manager for all your Assets

Within the DeltaLakeIOManager, define:

• root_uri: the root path where your Delta Tables will be created
• storage_options: configuration for accessing the storage location
• schema: the name of the schema to use (optional, defaults to public)

from dagster import Definitions\nfrom dagster_deltalake import DeltaLakePyarrowIOManager, LocalConfig\n\ndefs = Definitions(\n   assets=all_assets,\n   resources={\n        \"io_manager\": DeltaLakePyarrowIOManager(\n            root_uri=\"path/to/deltalake\",\n            storage_options=LocalConfig(),\n            schema=\"dagster_deltalake\",\n        ),\n   },\n)\n

Now, when you materialize an Asset, it will be saved as a Delta Lake table in the folder dagster_deltalake/asset_name under the root directory path/to/deltalake.

    The default Delta Lake I/O Manager supports Arrow reads and writes. You can also use the Delta Lake I/O Manager with pandas or polars.

    "},{"location":"integrations/delta-lake-dagster/#creating-delta-lake-tables-with-dagster","title":"Creating Delta Lake Tables with Dagster","text":"

    You don\u2019t need to do anything else to store your Dagster Assets as Delta Lake tables. The I/O Manager will handle storing and loading your Assets as Delta Lake tables from now on.

    You can proceed to write Dagster code as you normally would. For example, you can create an Asset that reads in some toy data about animals and writes it out to an Arrow Table:

    import pyarrow as pa\nfrom pyarrow import csv\n\nfrom dagster import asset\n\n@asset\ndef raw_dataset() -> pa.Table:\n   n_legs = pa.array([2, 4, None, 100])\n   animals = pa.array([\"Flamingo\", \"Horse\", \"Brittle stars\", \"Centipede\"])\n   data = {'n_legs': n_legs, 'animals': animals}\n\n   return pa.Table.from_pydict(data)\n

When you materialize the Asset defined above (using the config settings defined earlier), the Delta Lake I/O Manager will create the table dagster_deltalake/raw_dataset if it doesn\u2019t exist yet.

    "},{"location":"integrations/delta-lake-dagster/#overwrites-when-rematerializing-assets","title":"Overwrites when Rematerializing Assets","text":"

    If the table does already exist at the specified location, the Delta Lake I/O Manager will perform an overwrite. Delta Lake\u2019s transaction log maintains a record of all changes to your Delta Lake tables. You can inspect the record of changes to your Delta Lake tables by taking a look at these transaction logs.
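
For instance, here is a small sketch of inspecting that record with deltalake directly (the table path below assumes the root_uri and schema configured earlier on this page):

from deltalake import DeltaTable\n\ndt = DeltaTable(\"path/to/deltalake/dagster_deltalake/raw_dataset\")\n\n# Each entry records one transaction, e.g. the WRITE operations from rematerializations\nprint(dt.history())\n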

    "},{"location":"integrations/delta-lake-dagster/#loading-delta-lake-tables-in-downstream-assets","title":"Loading Delta Lake Tables in Downstream Assets","text":"

    You can use Assets stored as Delta Lake tables as input to downstream Assets. Dagster and the Delta Lake I/O Manager make this easy for you.

    You can write Dagster code as you normally would. Pass the upstream Asset as an argument to the downstream object to set up the dependency. Make sure to define the correct data type.

    The Delta Lake I/O Manager will handle reading and writing the data from your Delta Lake.

    import pyarrow as pa\nfrom dagster import asset\n\n# ... raw_dataset asset is defined here ...\n\n@asset\ndef clean_dataset(raw_dataset: pa.Table) -> pa.Table:\n   return raw_dataset.drop_null()\n
    "},{"location":"integrations/delta-lake-dagster/#reading-existing-delta-lake-tables-into-dagster","title":"Reading Existing Delta Lake Tables into Dagster","text":"

    You can make existing Delta Lake tables (that were not created in Dagster) available to your Dagster assets. Use the SourceAsset object and pass the table name as the key argument:

    from dagster import SourceAsset\n\niris_harvest_data = SourceAsset(key=\"more_animal_data\")\n

    This will load a table more_animal_data located at <root_uri>/<schema> as configured in the Definitions object above (see Delta Lake I/O Manager section).

    "},{"location":"integrations/delta-lake-dagster/#column-pruning","title":"Column Pruning","text":"

    You can often improve the efficiency of your computations by only loading specific columns of your Delta table. This is called column pruning.

With the Delta Lake I/O manager, you can select specific columns to load by defining the columns in the metadata parameter of the AssetIn that loads the upstream Asset:

    import pyarrow as pa\nfrom dagster import AssetIn, asset\n\n# this example uses the clean_dataset Asset defined earlier\n\n@asset(\n       ins={\n           \"mammal_bool\": AssetIn(\n               key=\"clean_dataset\",\n               metadata={\"columns\": [\"is_mammal\", \"animals\"]},\n           )\n       }\n)\ndef mammal_data(mammal_bool: pa.Table) -> pa.Table:\n   mammals = mammal_bool[\"is_mammal\"].cast(\"bool\")\n   animals = mammal_bool[\"animals\"]\n   data = {\"mammal_bool\": mammals, \"animals\": animals}\n   return pa.Table.from_pydict(data)\n

Here, we select only the is_mammal and animals columns from the clean_dataset table and load them into an AssetIn object called mammal_bool. This AssetIn object is used to create a new Asset mammal_data, containing only the selected columns.

    "},{"location":"integrations/delta-lake-dagster/#working-with-partitioned-assets","title":"Working with Partitioned Assets","text":"

    Partitioning is an important feature of Delta Lake that can make your computations more efficient. The Delta Lake I/O manager helps you read and write partitioned data easily. You can work with static partitions, time-based partitions, multi-partitions, and dynamic partitions.

    For example, you can partition the Iris dataset on the species column as follows:

import pyarrow as pa\nimport pyarrow.compute as pc\n\nfrom dagster import StaticPartitionsDefinition, asset\n\n@asset(\n  partitions_def=StaticPartitionsDefinition(\n      [\"Human\", \"Horse\",]\n  ),\n  metadata={\"partition_expr\": \"animals\"},\n)\ndef dataset_partitioned(\n   context,\n   clean_dataset: pa.Table,\n   ) -> pa.Table:\n   animals = context.asset_partition_key_for_output()\n   table = clean_dataset\n\n   return table.filter(pc.field(\"animals\") == animals)\n

    To partition your data, make sure to include the relevant partitions_def and metadata arguments to the @asset decorator. Refer to the Dagster documentation on partitioning assets for more information.

    "},{"location":"integrations/delta-lake-dagster/#using-delta-lake-and-dagster-with-pandas","title":"Using Delta Lake and Dagster with Pandas","text":"

    To read and write data to Delta Lake using pandas, use the DeltaLakePandasIOManager().

    You will need to install it using:

    pip install dagster-deltalake-pandas\n

    In your Definitions object, change the io_manager to DeltaLakePandasIOManager():

from dagster import Definitions\nfrom dagster_deltalake import LocalConfig\nfrom dagster_deltalake_pandas import DeltaLakePandasIOManager\n\n\ndefs = Definitions(\n   assets=all_assets,\n   resources={\n        \"io_manager\": DeltaLakePandasIOManager(\n            root_uri=\"path/to/deltalake\",\n            storage_options=LocalConfig(),\n            schema=\"dagster_deltalake\",\n        ),\n   },\n)\n

    Now you can read and write Dagster Assets defined as pandas DataFrames in Delta Lake format. For example:

    import pandas as pd\nfrom dagster import asset\n\n@asset\ndef iris_dataset() -> pd.DataFrame:\n   return pd.read_csv(\n       \"https://docs.dagster.io/assets/iris.csv\",\n       names=[\n           \"sepal_length_cm\",\n           \"sepal_width_cm\",\n           \"petal_length_cm\",\n           \"petal_width_cm\",\n           \"species\",\n       ],\n   )\n
    "},{"location":"integrations/delta-lake-dagster/#using-delta-lake-and-dagster-with-polars","title":"Using Delta Lake and Dagster with Polars","text":"

To read and write data to Delta Lake using Polars, use the DeltaLakePolarsIOManager().

    You will need to install it using:

    pip install dagster-deltalake-polars\n

    In your Definitions object, change the io_manager to DeltaLakePolarsIOManager():

from dagster import Definitions\nfrom dagster_deltalake import LocalConfig\nfrom dagster_deltalake_polars import DeltaLakePolarsIOManager\n\ndefs = Definitions(\n   assets=all_assets,\n   resources={\n        \"io_manager\": DeltaLakePolarsIOManager(\n            root_uri=\"path/to/deltalake\",\n            storage_options=LocalConfig(),\n            schema=\"dagster_deltalake\",\n        ),\n   },\n)\n

    Now you can read and write Dagster Assets defined as Polars DataFrames in Delta Lake format. For example:

import polars as pl\nfrom dagster import asset\n\n\n@asset\ndef iris_dataset() -> pl.DataFrame:\n   return pl.read_csv(\n       \"https://docs.dagster.io/assets/iris.csv\",\n       new_columns=[\n           \"sepal_length_cm\",\n           \"sepal_width_cm\",\n           \"petal_length_cm\",\n           \"petal_width_cm\",\n           \"species\",\n       ],\n       has_header=False,\n   )\n
    "},{"location":"integrations/delta-lake-dagster/#delta-lake-table-resource","title":"Delta Lake Table Resource","text":"

    I/O managers are a helpful tool in many common usage situations. But when you need lower-level access, the I/O Manager might not be the right tool to use. In these cases you may want to use the Delta Lake Table Resource.

    The Delta Lake Table Resource is a low-level access method to the table object. It gives you more fine-grained control and allows for modeling of more complex data. You can also use the Table Resource to run optimization and vacuuming jobs.
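
As a rough sketch of what using the resource might look like (assuming dagster_deltalake exposes a DeltaTableResource with a url config and a load() method returning a deltalake DeltaTable; check the API reference for the exact signature, and note the table url is a placeholder):

from dagster import Definitions, asset\nfrom dagster_deltalake import DeltaTableResource, LocalConfig\n\n@asset\ndef small_files_compacted(delta_table: DeltaTableResource):\n    # Load the underlying deltalake DeltaTable and run maintenance jobs on it\n    dt = delta_table.load()\n    dt.optimize.compact()\n    dt.vacuum(retention_hours=168, dry_run=True)\n\ndefs = Definitions(\n    assets=[small_files_compacted],\n    resources={\n        \"delta_table\": DeltaTableResource(\n            url=\"path/to/deltalake/dagster_deltalake/raw_dataset\",\n            storage_options=LocalConfig(),\n        ),\n    },\n)\n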

    "},{"location":"integrations/delta-lake-dagster/#schema-and-constraint-enforcement","title":"Schema and Constraint Enforcement","text":"

    Delta Lake provides built-in checks to ensure schema consistency when appending data to a table, as well as the ability to evolve the schema. This is a great feature for the Dagster community as it prevents bad data from being appended to tables, ensuring data consistency and accuracy.

    Read more about how to add constraints to a table in the Delta Lake documentation.

    "},{"location":"integrations/delta-lake-dagster/#z-ordering","title":"Z-Ordering","text":"

    Delta Lake offers Z-ordering functionality to colocate similar data in the same files. This can make your Delta Table queries much more efficient via file skipping. Dagster users can now benefit from this great feature through the Delta Lake I/O Manager.

    Read more about Z-Ordering on the Delta Lake blog.

    "},{"location":"integrations/delta-lake-dagster/#contribute","title":"Contribute","text":"

    To contribute to the Delta Lake and Dagster integration, go to [link]

    "},{"location":"integrations/delta-lake-dask/","title":"Using Delta Lake with Dask","text":"

    Delta Lake is a great storage format for Dask analyses. This page explains why and how to use Delta Lake with Dask.

    You will learn how to read Delta Lakes into Dask DataFrames, how to query Delta tables with Dask, and the unique advantages Delta Lake offers the Dask community.

    Here are some of the benefits that Delta Lake provides Dask users:

    • better performance with file skipping
    • enhanced file skipping via Z Ordering
    • ACID transactions for reliable writes
    • easy time-travel functionality

\u2757\ufe0f dask-deltatable currently works with deltalake<=0.13.0. See https://github.com/dask-contrib/dask-deltatable/issues/65

    "},{"location":"integrations/delta-lake-dask/#install-dask-deltatable","title":"Install Dask-Deltatable","text":"

    To use Delta Lake with Dask, first install the library using

    pip install dask-deltatable\n
    "},{"location":"integrations/delta-lake-dask/#reading-delta-tables-into-a-dask-dataframe","title":"Reading Delta Tables into a Dask DataFrame","text":"

    You can read data stored in a Delta Lake into a Dask DataFrame using dask-deltatable.read_deltalake.

Let's read in a toy dataset to see what we can do with Delta Lake and Dask. You can access the data stored as a Delta Lake on Github.

    import dask_deltatable as ddt\n\n# read delta table into Dask DataFrame\ndelta_path = \"path/to/data/people_countries_delta_dask\"\nddf = ddt.read_deltalake(delta_path)\n

    Dask is a library for efficient distributed computing and works with lazy evaluation. Function calls to dask.dataframe build a task graph in the background. To trigger computation, call .compute():

    > ddf.compute()\n\n|    | first_name   | last_name   | country   | continent   |\n|---:|:-------------|:------------|:----------|:------------|\n|  0 | Ernesto      | Guevara     | Argentina | NaN         |\n|  0 | Bruce        | Lee         | China     | Asia        |\n|  1 | Jack         | Ma          | China     | Asia        |\n|  0 | Wolfgang     | Manche      | Germany   | NaN         |\n|  1 | Soraya       | Jala        | Germany   | NaN         |\n

    You can read in specific versions of Delta tables by specifying a version number or a timestamp:

    # with specific version\nddf = ddt.read_deltalake(delta_path, version=3)\n\n# with specific datetime\nddt.read_deltalake(delta_path, datetime=\"2018-12-19T16:39:57-08:00\")\n

    dask-deltatable also supports reading from remote sources like S3 with:

    ddt.read_deltalake(\"s3://bucket_name/delta_path\", version=3)\n

    To read data from remote sources you'll need to make sure the credentials are properly configured in environment variables or config files. Refer to your cloud provider documentation to configure these.

    "},{"location":"integrations/delta-lake-dask/#what-can-i-do-with-a-dask-deltatable","title":"What can I do with a Dask Deltatable?","text":"

    Reading a Delta Lake in with dask-deltatable returns a regular Dask DataFrame. You can perform all the regular Dask operations on this DataFrame.

    Let's take a look at the first few rows:

    > ddf.head(n=3)\n\n|    | first_name   | last_name   | country   |   continent |\n|---:|:-------------|:------------|:----------|------------:|\n|  0 | Ernesto      | Guevara     | Argentina |         nan |\n

    dask.dataframe.head() shows you the first rows of the first partition in the dataframe. In this case, the first partition only has 1 row.

    This is because the Delta Lake has been partitioned by country:

    > !ls ../../data/people_countries_delta_dask\n_delta_log        country=Argentina country=China     country=Germany\n

    dask-deltatable neatly reads in the partitioned Delta Lake into corresponding Dask DataFrame partitions:

    > # see number of partitions\n> ddf.npartitions\n3\n

    You can inspect a single partition using dask.dataframe.get_partition():

    > ddf.get_partition(n=1).compute()\n\n|    | first_name   | last_name   | country   | continent   |\n|---:|:-------------|:------------|:----------|:------------|\n|  0 | Bruce        | Lee         | China     | Asia        |\n|  1 | Jack         | Ma          | China     | Asia        |\n
    "},{"location":"integrations/delta-lake-dask/#perform-dask-operations","title":"Perform Dask Operations","text":"

    Let's perform some basic computations over the Delta Lake data that's now stored in our Dask DataFrame.

    Suppose you want to group the dataset by the country column:

    > ddf.groupby(['country']).count().compute()\n\n| country   |   first_name |   last_name |   continent |\n|:----------|-------------:|------------:|------------:|\n| Argentina |            1 |           1 |           1 |\n| China     |            2 |           2 |           2 |\n| Germany   |            2 |           2 |           2 |\n

    Dask executes this groupby operation in parallel across all available cores.

    "},{"location":"integrations/delta-lake-dask/#map-functions-over-partitions","title":"Map Functions over Partitions","text":"

    You can also use Dask's map_partitions method to map a custom Python function over all the partitions.

    Let's write a function that will replace the missing continent values with the right continent names.

# define custom python function\n\n# get the string used for missing continent values\ndf = ddf.get_partition(0).compute()\nna_string = df.iloc[0].continent\nna_string\n\n# define function\ndef replace_proper(partition, na_string):\n    if (partition.country == \"Argentina\").any():\n        partition.loc[partition.country == \"Argentina\"] = partition.loc[partition.country == \"Argentina\"].replace(na_string, \"South America\")\n    if (partition.country == \"Germany\").any():\n        partition.loc[partition.country == \"Germany\"] = partition.loc[partition.country == \"Germany\"].replace(na_string, \"Europe\")\n    return partition\n

    Now map this over all partitions in the Dask DataFrame:

    # define metadata and map function over partitions\n> meta = dict(ddf.dtypes)\n> ddf3 = ddf.map_partitions(replace_proper, na_string, meta=meta)\n> ddf3.compute()\n\n|    | first_name   | last_name   | country   | continent     |\n|---:|:-------------|:------------|:----------|:--------------|\n|  0 | Ernesto      | Guevara     | Argentina | South America |\n|  0 | Bruce        | Lee         | China     | Asia          |\n|  1 | Jack         | Ma          | China     | Asia          |\n|  0 | Wolfgang     | Manche      | Germany   | Europe        |\n|  1 | Soraya       | Jala        | Germany   | Europe        |\n
    "},{"location":"integrations/delta-lake-dask/#write-to-delta-lake","title":"Write to Delta Lake","text":"

    After doing your data processing in Dask, you can write the data back out to Delta Lake using to_deltalake:

    ddt.to_deltalake(\"tmp/test_write\", ddf)\n
    "},{"location":"integrations/delta-lake-dask/#contribute-to-dask-deltalake","title":"Contribute to dask-deltalake","text":"

    To contribute, go to the dask-deltalake Github repository.

    "},{"location":"integrations/delta-lake-datafusion/","title":"Using Delta Lake with DataFusion","text":"

    This page explains how to use Delta Lake with DataFusion.

    Delta Lake offers DataFusion users better performance and more features compared to other formats like CSV or Parquet.

    Delta Lake works well with the DataFusion Rust API and the DataFusion Python API. It's a great option for all DataFusion users.

    Delta Lake also depends on DataFusion to implement SQL-related functionality under the hood. We will also discuss this dependency at the end of this guide in case you're interested in learning more about the symbiotic relationship between the two libraries.

    "},{"location":"integrations/delta-lake-datafusion/#delta-lake-performance-benefits-for-datafusion-users","title":"Delta Lake performance benefits for DataFusion users","text":"

    Let's run some DataFusion queries on a Parquet file and a Delta table with the same data to learn more about the performance benefits of Delta Lake.

    Suppose you have the following dataset with 1 billion rows and 9 columns. Here are the first three rows of data:

    +-------+-------+--------------+-------+-------+--------+------+------+---------+\n| id1   | id2   | id3          |   id4 |   id5 |    id6 |   v1 |   v2 |      v3 |\n|-------+-------+--------------+-------+-------+--------+------+------+---------|\n| id016 | id046 | id0000109363 |    88 |    13 | 146094 |    4 |    6 | 18.8377 |\n| id039 | id087 | id0000466766 |    14 |    30 | 111330 |    4 |   14 | 46.7973 |\n| id047 | id098 | id0000307804 |    85 |    23 | 187639 |    3 |    5 | 47.5773 |\n+-------+-------+--------------+-------+-------+--------+------+------+---------+\n

    Here's how to register a Delta Lake table as a PyArrow dataset:

    from datafusion import SessionContext\nfrom deltalake import DeltaTable\n\nctx = SessionContext()\ntable = DeltaTable(\"G1_1e9_1e2_0_0\")\nctx.register_dataset(\"my_delta_table\", table.to_pyarrow_dataset())\n

    Now query the table:

    ctx.sql(\"select id1, sum(v1) as v1 from my_delta_table where id1='id096' group by id1\")\n

    That query takes 2.8 seconds to execute.

    Let's register the same dataset as a Parquet table, run the same query, and compare the runtime difference.

    Register the Parquet table and run the query:

    path = \"G1_1e9_1e2_0_0.parquet\"\nctx.register_parquet(\"my_parquet_table\", path)\nctx.sql(\"select id1, sum(v1) as v1 from my_parquet_table where id1='id096' group by id1\")\n

    This query takes 5.3 seconds to run.

    Parquet stores data in row groups and DataFusion can intelligently skip row groups that don't contain relevant data, so the query is faster than a file format like CSV which doesn't support row group skipping.

    Delta Lake stores file-level metadata information in the transaction log, so it can skip entire files when queries are executed. Delta Lake can skip entire files and then skip row groups within the individual files. This makes Delta Lake even faster than Parquet files, especially for larger datasets spread across many files.

    "},{"location":"integrations/delta-lake-datafusion/#delta-lake-features-for-datafusion-users","title":"Delta Lake features for DataFusion users","text":"

    Delta Lake also provides other features that are useful for DataFusion users like ACID transactions, concurrency protection, time travel, versioned data, and more.
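
For example, time travel composes naturally with the DataFusion workflow shown above. Here is a hedged sketch (the table name and version number are illustrative) that registers an earlier version of the table as its own dataset:

from datafusion import SessionContext\nfrom deltalake import DeltaTable\n\nctx = SessionContext()\n\n# Register version 0 of the table so you can query historical data alongside the latest version\ndt_v0 = DeltaTable(\"G1_1e9_1e2_0_0\", version=0)\nctx.register_dataset(\"my_delta_table_v0\", dt_v0.to_pyarrow_dataset())\n\nctx.sql(\"select count(*) from my_delta_table_v0\")\n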

    "},{"location":"integrations/delta-lake-datafusion/#why-delta-lake-depends-on-datafusion","title":"Why Delta Lake depends on DataFusion","text":"

    Delta Lake depends on DataFusion to provide some end-user features.

    DataFusion is useful in providing SQL-related Delta Lake features. Some examples:

    • Update and merge are written in terms of SQL expressions.
    • Invariants and constraints are written in terms of SQL expressions.

    Anytime we have to evaluate SQL, we need some sort of SQL engine. We use DataFusion for that.

    "},{"location":"integrations/delta-lake-datafusion/#conclusion","title":"Conclusion","text":"

    Delta Lake is a great file format for DataFusion users.

    Delta Lake also uses DataFusion to provide some end-user features.

    DataFusion and Delta Lake have a wonderful symbiotic relationship and play very nicely with each other.

    See this guide for more information on Delta Lake and PyArrow and why PyArrow Datasets are often a better option than PyArrow tables.

    "},{"location":"integrations/delta-lake-pandas/","title":"Using Delta Lake with pandas","text":"

Delta Lake is a great storage system for pandas analyses. This page shows how easy it is to use Delta Lake with pandas, the unique features Delta Lake offers pandas users, and how Delta Lake can make your pandas analyses run faster.

Delta Lake is very easy to install for pandas analyses: just run pip install deltalake.

Delta Lake allows for performance optimizations, so pandas queries can run much faster than the same query run on data stored in CSV or Parquet. See the following chart for the query runtime of a Delta table compared with CSV and Parquet.

    Z Ordered Delta tables run this query much faster than when the data is stored in Parquet or CSV. Let's dive in deeper and see how Delta Lake makes pandas faster.

    "},{"location":"integrations/delta-lake-pandas/#delta-lake-makes-pandas-queries-run-faster","title":"Delta Lake makes pandas queries run faster","text":"

    There are a few reasons Delta Lake can make pandas queries run faster:

    1. column pruning: only grabbing the columns relevant for a query
    2. file skipping: only reading files with data for the query
    3. row group skipping: only reading row groups with data for the query
    4. Z ordering data: colocating similar data in the same files, so file skipping is more effective

    Reading less data (fewer columns and/or fewer rows) is how Delta Lake makes pandas queries run faster.

    Parquet allows for column pruning and row group skipping, but doesn't support file-level skipping or Z Ordering. CSV doesn't support any of these performance optimizations.

    Let's take a look at a sample dataset and run a query to see the performance enhancements offered by Delta Lake.

    Suppose you have a 1 billion row dataset with 9 columns, here are the first three rows of the dataset:

    +-------+-------+--------------+-------+-------+--------+------+------+---------+\n| id1   | id2   | id3          |   id4 |   id5 |    id6 |   v1 |   v2 |      v3 |\n|-------+-------+--------------+-------+-------+--------+------+------+---------|\n| id016 | id046 | id0000109363 |    88 |    13 | 146094 |    4 |    6 | 18.8377 |\n| id039 | id087 | id0000466766 |    14 |    30 | 111330 |    4 |   14 | 46.7973 |\n| id047 | id098 | id0000307804 |    85 |    23 | 187639 |    3 |    5 | 47.5773 |\n+-------+-------+--------------+-------+-------+--------+------+------+---------+\n

The dataset is roughly 50 GB when stored as an uncompressed CSV file. Let's run some queries on a 2021 MacBook M1 with 64 GB of RAM.

    Start by running the query on an uncompressed CSV file:

    (\n    pd.read_csv(f\"{Path.home()}/data/G1_1e9_1e2_0_0.csv\", usecols=[\"id1\", \"id2\", \"v1\"])\n    .query(\"id1 == 'id016'\")\n    .groupby(\"id2\")\n    .agg({\"v1\": \"sum\"})\n)\n

    This query takes 234 seconds to execute. It runs out of memory if the usecols parameter is not set.

    Now let's convert the CSV dataset to Parquet and run the same query on the data stored in a Parquet file.

    (\n    pd.read_parquet(\n        f\"{Path.home()}/data/G1_1e9_1e2_0_0.parquet\", columns=[\"id1\", \"id2\", \"v1\"]\n    )\n    .query(\"id1 == 'id016'\")\n    .groupby(\"id2\")\n    .agg({\"v1\": \"sum\"})\n)\n

    This query takes 118 seconds to execute.

Parquet stores data in row groups and allows for row group skipping when filter predicates are set. Run the Parquet query again with row group skipping enabled:

    (\n    pd.read_parquet(\n        f\"{Path.home()}/data/G1_1e9_1e2_0_0.parquet\",\n        columns=[\"id1\", \"id2\", \"v1\"],\n        filters=[(\"id1\", \"==\", \"id016\")],\n    )\n    .query(\"id1 == 'id016'\")\n    .groupby(\"id2\")\n    .agg({\"v1\": \"sum\"})\n)\n

    This query runs in 19 seconds. Lots of row groups can be skipped for this particular query.

    Now let's run the same query on a Delta table to see the out-of-the box performance:

    (\n    DeltaTable(f\"{Path.home()}/data/deltalake_baseline_G1_1e9_1e2_0_0\", version=0)\n    .to_pandas(filters=[(\"id1\", \"==\", \"id016\")], columns=[\"id1\", \"id2\", \"v1\"])\n    .query(\"id1 == 'id016'\")\n    .groupby(\"id2\")\n    .agg({\"v1\": \"sum\"})\n)\n

    This query runs in 8 seconds, which is a significant performance enhancement.

    Now let's Z Order the Delta table by id1 which will make the data skipping even better. Run the query again on the Z Ordered Delta table:

    (\n    DeltaTable(f\"{Path.home()}/data/deltalake_baseline_G1_1e9_1e2_0_0\", version=1)\n    .to_pandas(filters=[(\"id1\", \"==\", \"id016\")], columns=[\"id1\", \"id2\", \"v1\"])\n    .query(\"id1 == 'id016'\")\n    .groupby(\"id2\")\n    .agg({\"v1\": \"sum\"})\n)\n

    The query now executes in 2.4 seconds.

    Delta tables can make certain pandas queries run much faster.

    "},{"location":"integrations/delta-lake-pandas/#delta-lake-lets-pandas-users-time-travel","title":"Delta Lake lets pandas users time travel","text":"

    Start by creating a Delta table:

    from deltalake import write_deltalake, DeltaTable\n\ndf = pd.DataFrame({\"num\": [1, 2, 3], \"letter\": [\"a\", \"b\", \"c\"]})\nwrite_deltalake(\"tmp/some-table\", df)\n

    Here are the contents of the Delta table (version 0 of the Delta table):

    +-------+----------+\n|   num | letter   |\n|-------+----------|\n|     1 | a        |\n|     2 | b        |\n|     3 | c        |\n+-------+----------+\n

    Now append two rows to the Delta table:

    df = pd.DataFrame({\"num\": [8, 9], \"letter\": [\"dd\", \"ee\"]})\nwrite_deltalake(\"tmp/some-table\", df, mode=\"append\")\n

    Here are the contents after the append operation (version 1 of the Delta table):

    +-------+----------+\n|   num | letter   |\n|-------+----------|\n|     1 | a        |\n|     2 | b        |\n|     3 | c        |\n|     8 | dd       |\n|     9 | ee       |\n+-------+----------+\n

    Now perform an overwrite transaction:

    df = pd.DataFrame({\"num\": [11, 22], \"letter\": [\"aa\", \"bb\"]})\nwrite_deltalake(\"tmp/some-table\", df, mode=\"overwrite\")\n

    Here are the contents after the overwrite operation (version 2 of the Delta table):

    +-------+----------+\n|   num | letter   |\n|-------+----------|\n|     8 | dd       |\n|     9 | ee       |\n+-------+----------+\n

    Read in the Delta table and it will grab the latest version by default:

    DeltaTable(\"tmp/some-table\").to_pandas()\n\n+-------+----------+\n|   num | letter   |\n|-------+----------|\n|    11 | aa       |\n|    22 | bb       |\n+-------+----------+\n

    You can easily time travel back to version 0 of the Delta table:

    DeltaTable(\"tmp/some-table\", version=0).to_pandas()\n\n+-------+----------+\n|   num | letter   |\n|-------+----------|\n|     1 | a        |\n|     2 | b        |\n|     3 | c        |\n+-------+----------+\n

    You can also time travel to version 1 of the Delta table:

    DeltaTable(\"tmp/some-table\", version=1).to_pandas()\n\n+-------+----------+\n|   num | letter   |\n|-------+----------|\n|     1 | a        |\n|     2 | b        |\n|     3 | c        |\n|     8 | dd       |\n|     9 | ee       |\n+-------+----------+\n

    Time travel is a powerful feature that pandas users cannot access with CSV or Parquet.

    "},{"location":"integrations/delta-lake-pandas/#schema-enforcement","title":"Schema enforcement","text":"

Delta tables only allow you to append DataFrames with a matching schema by default. Suppose you have a DataFrame with num and animal columns, which is different from the Delta table that has num and letter columns.

    Try to append this DataFrame with a mismatched schema to the existing table:

    df = pd.DataFrame({\"num\": [5, 6], \"animal\": [\"cat\", \"dog\"]})\nwrite_deltalake(\"tmp/some-table\", df)\n

    This transaction will be rejected and will return the following error message:

    ValueError: Schema of data does not match table schema\nData schema:\nnum: int64\nanimal: string\n-- schema metadata --\npandas: '{\"index_columns\": [{\"kind\": \"range\", \"name\": null, \"start\": 0, \"' + 474\nTable Schema:\nnum: int64\nletter: string\n

    Schema enforcement protects your table from getting corrupted by appending data with mismatched schema. Parquet and CSV don't offer schema enforcement for pandas users.

    "},{"location":"integrations/delta-lake-pandas/#overwriting-schema-of-table","title":"Overwriting schema of table","text":"

    You can overwrite the table contents and schema by setting the schema_mode option. Here's how to overwrite the table contents:

    write_deltalake(\"tmp/some-table\", df, mode=\"overwrite\", schema_mode=\"overwrite\")\n

    Here are the contents of the table after the values and schema have been overwritten:

    +-------+----------+\n|   num | animal   |\n|-------+----------|\n|     5 | cat      |\n|     6 | dog      |\n+-------+----------+\n

    If you want the schema to be merged instead, specify schema_mode=\"merge\".
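
As a rough sketch (assuming the DataFrame df with num and animal columns from above, and noting that on some deltalake versions schema merging requires the Rust writer engine):

write_deltalake(\"tmp/some-table\", df, mode=\"append\", schema_mode=\"merge\", engine=\"rust\")\n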

    "},{"location":"integrations/delta-lake-pandas/#in-memory-vs-in-storage-data-changes","title":"In-memory vs. in-storage data changes","text":"

    It's important to distinguish between data stored in-memory and data stored on disk when understanding the functionality offered by Delta Lake.

    pandas loads data from storage (CSV, Parquet, or Delta Lake) into in-memory DataFrames.

    pandas makes it easy to modify the data in memory, say update a column value. It's not easy to update a column value in storage systems like CSV or Parquet using pandas.

    Delta Lake makes it easy for pandas users to update data in storage.
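
For example, DeltaTable.update lets you change column values directly in storage. Here is a minimal sketch against the tmp/some-table table from above (the predicate and new value are purely illustrative):

from deltalake import DeltaTable\n\ndt = DeltaTable(\"tmp/some-table\")\n\n# Rewrite the letter value for the row where num is 11, directly in storage\ndt.update(predicate=\"num = 11\", new_values={\"letter\": \"zz\"})\n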

    "},{"location":"integrations/delta-lake-pandas/#why-delta-lake-allows-for-faster-queries","title":"Why Delta Lake allows for faster queries","text":"

    Delta tables store data in many files and metadata about the files in the transaction log. Delta Lake allows for certain queries to skip entire files, which makes pandas queries run much faster.

    "},{"location":"integrations/delta-lake-pandas/#more-resources","title":"More resources","text":"

    See this talk on why Delta Lake is the best file format for pandas analyses to learn more:

    "},{"location":"integrations/delta-lake-pandas/#conclusion","title":"Conclusion","text":"

    Delta Lake provides many features that make it an excellent format for pandas analyses:

    • performance optimizations make pandas queries run faster
    • data management features make pandas analyses more reliable
    • advanced features allow you to perform more complex pandas analyses

    Python deltalake offers pandas users a better experience compared with CSV/Parquet.

    "},{"location":"integrations/delta-lake-polars/","title":"Using Delta Lake with polars","text":"

    This page explains why Delta Lake is a great storage system for Polars analyses.

    You will learn how to create Delta tables with Polars, how to query Delta tables with Polars, and the unique advantages Delta Lake offers the Polars community.

    Here are some amazing benefits that Delta Lake provides Polars users:

    • time travel
    • ACID transactions for reliable writes
    • better performance with file skipping
    • enhanced file skipping via Z Ordering
    • ability to rollback mistakes
    • and many, many more

    Let's start by showing how to use Polars with Delta Lake, explore how Delta Lake can make Polars queries run faster, and then look at all the cool features Delta Lake offers Polars users.

    "},{"location":"integrations/delta-lake-polars/#creating-a-delta-lake-table-with-polars","title":"Creating a Delta Lake table with Polars","text":"

    Create a Polars DataFrame and write it out to a Delta table:

    import polars as pl\n\ndf = pl.DataFrame({\"x\": [1, 2, 3]})\ndf.write_delta(\"tmp/bear_delta_lake\")\n

    Inspect the contents of the Delta table:

    print(pl.read_delta(\"tmp/bear_delta_lake\"))\n\n+-----+\n| x   |\n| --- |\n| i64 |\n+=====+\n| 1   |\n| 2   |\n| 3   |\n+-----+\n

    Now create another Polars DataFrame and append it to the existing Delta table:

    df2 = pl.DataFrame({\"x\": [8, 9, 10]})\ndf2.write_delta(\"tmp/bear_delta_lake\", mode=\"append\")\n

    Re-inspect the contents of the Delta table:

    print(pl.read_delta(\"tmp/bear_delta_lake\"))\n\n+-----+\n| x   |\n| --- |\n| i64 |\n+=====+\n| 1   |\n| 2   |\n| 3   |\n| 8   |\n| 9   |\n| 10  |\n+-----+\n

    Now overwrite the existing Delta table:

    df3 = pl.DataFrame({\"x\": [55, 66, 77]})\ndf3.write_delta(\"tmp/bear_delta_lake\", mode=\"overwrite\")\n

    Inspect the Delta table:

    print(pl.read_delta(\"tmp/bear_delta_lake\"))\n\n+-----+\n| x   |\n| --- |\n| i64 |\n+=====+\n| 55  |\n| 66  |\n| 77  |\n+-----+\n

The Delta table now has three versions: one from the initial write, one from the append, and one from the overwrite.

    "},{"location":"integrations/delta-lake-polars/#time-travel-with-delta-lake-for-polars","title":"Time travel with Delta Lake for Polars","text":"

    Time travel back to version 0 of the Delta table:

    print(pl.read_delta(\"tmp/bear_delta_lake\", version=0))\n\n+-----+\n| x   |\n| --- |\n| i64 |\n+=====+\n| 1   |\n| 2   |\n| 3   |\n+-----+\n

    Time travel back to version 1 of the Delta table:

    print(pl.read_delta(\"tmp/bear_delta_lake\", version=1))\n\n+-----+\n| x   |\n| --- |\n| i64 |\n+=====+\n| 1   |\n| 2   |\n| 3   |\n| 9   |\n| 8   |\n| 10  |\n+-----+\n

    Read the Delta table without specifying a version and see how it reads the latest version by default:

    print(pl.read_delta(\"tmp/bear_delta_lake\"))\n\n+-----+\n| x   |\n| --- |\n| i64 |\n+=====+\n| 55  |\n| 66  |\n| 77  |\n+-----+\n

    Let's dive into how to read Delta tables with Polars in more detail and compare the query runtime performance on larger datasets.

    "},{"location":"integrations/delta-lake-polars/#reading-a-delta-lake-table-with-polars","title":"Reading a Delta Lake table with Polars","text":"

    Let's look at the h2o groupby dataset that has 1 billion rows and 9 columns. Here are the first three rows of the dataset:

    +-------+-------+--------------+-------+-------+--------+------+------+---------+\n| id1   | id2   | id3          |   id4 |   id5 |    id6 |   v1 |   v2 |      v3 |\n|-------+-------+--------------+-------+-------+--------+------+------+---------|\n| id016 | id046 | id0000109363 |    88 |    13 | 146094 |    4 |    6 | 18.8377 |\n| id039 | id087 | id0000466766 |    14 |    30 | 111330 |    4 |   14 | 46.7973 |\n| id047 | id098 | id0000307804 |    85 |    23 | 187639 |    3 |    5 | 47.5773 |\n+-------+-------+--------------+-------+-------+--------+------+------+---------+\n

    This dataset is 50GB when stored in an uncompressed CSV file. Let's run some queries on this dataset when it's stored in different file formats with Polars.

    This section will show the runtime for a query when the data is stored in CSV, Parquet, and Delta Lake and explain why Delta tables are the fastest.

    Start by running a query on an uncompressed CSV file with read_csv:

    pl.read_csv(\"~/data/G1_1e9_1e2_0_0.csv\").filter(pl.col(\"id1\") < \"id016\").group_by(\n    [\"id1\", \"id2\"]\n).agg(pl.sum(\"v1\").alias(\"v1_sum\")).collect()\n

    This query errors out after running for several minutes. The machine runs out of memory. Let's try it again with scan_csv.

    pl.scan_csv(\"~/data/G1_1e9_1e2_0_0.csv\").filter(pl.col(\"id1\") < \"id016\").group_by(\n    [\"id1\", \"id2\"]\n).agg(pl.sum(\"v1\").alias(\"v1_sum\")).collect()\n

    This query runs in 56.2 seconds.

    Now let's run the same query when the data is stored in a Parquet file:

    pl.scan_parquet(\"~/data/G1_1e9_1e2_0_0.parquet\").filter(\n    pl.col(\"id1\") < \"id016\"\n).group_by([\"id1\", \"id2\"]).agg(pl.sum(\"v1\").alias(\"v1_sum\")).collect()\n

    This query runs in 8.3 seconds. It's much faster because Polars is optimized to skip row groups in Parquet files that don't contain data that's relevant for the query.

Then run the same query on the newly created Delta table:

    pl.scan_delta(\"~/data/deltalake/G1_1e9_1e2_0_0\", version=1).filter(\n    pl.col(\"id1\") < \"id016\"\n).group_by([\"id1\", \"id2\"]).agg(pl.sum(\"v1\").alias(\"v1_sum\")).collect()\n

    This query runs in 7.2 seconds. Polars can run this query faster because it can inspect the Delta transaction log and skip entire files that don't contain relevant data before performing the ordinary Parquet row group skipping.

    Finally run the query on the Delta table after it has been Z Ordered by id1:

    pl.scan_delta(\"~/data/deltalake/G1_1e9_1e2_0_0\", version=2).filter(\n    pl.col(\"id1\") < \"id016\"\n).group_by([\"id1\", \"id2\"]).agg(pl.sum(\"v1\").alias(\"v1_sum\")).collect()\n

    This query runs in 3.5 seconds. The query on the Z Ordered Delta table is even faster because similar data has been co-located in the same files. This allows for even greater data skipping.

    Polars can leverage file skipping to query Delta tables very quickly.

    "},{"location":"integrations/delta-lake-polars/#why-polars-is-fast-with-delta-lake","title":"Why Polars is fast with Delta Lake","text":"

    Delta tables consist of metadata in a transaction log and data stored in Parquet files.

When Polars queries a Delta table, it starts by consulting the transaction log to understand the metadata of each file in the Delta table. This allows Polars to quickly identify which files can be skipped by the query.

CSV files don't contain any such metadata, so file skipping isn't an option. Polars can skip Parquet files based on metadata, but it needs to open up each file and read the metadata, which is slower than grabbing the file-level metadata directly from the transaction log.

    Parquet doesn't allow users to easily Z Order the data and colocate similar data in the same row groups. The Z Order optimizations are only supported in Delta tables.

Delta Lake offers Polars users unique performance optimizations.
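
You can see the file-level metadata that enables this skipping by inspecting the table's Add actions. A quick sketch, reusing the table path from the examples above:

import os\nfrom deltalake import DeltaTable\n\ntable_path = os.path.expanduser(\"~/data/deltalake/G1_1e9_1e2_0_0\")\n\n# Each row describes one data file: its path, size, and min/max statistics used for file skipping\nDeltaTable(table_path).get_add_actions(flatten=True).to_pandas()\n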

    "},{"location":"integrations/delta-lake-polars/#other-delta-lake-features-relevant-for-polars-users","title":"Other Delta Lake features relevant for Polars users","text":"
    • ACID transactions for reliable writes
    • better performance with file skipping
    • enhanced file skipping via Z Ordering
    • ability to rollback mistakes
    "},{"location":"integrations/delta-lake-polars/#conclusion","title":"Conclusion","text":"

    This guide shows how Delta Lake is a great storage format for Polars analyses.

    Delta Lake is easy to use, fast, and full of features that are great for Polars users.

    "},{"location":"integrations/object-storage/hdfs/","title":"HDFS Storage Backend","text":"

    HDFS support is provided via the hdfs-native-object-store package, which sits on top of hdfs-native. This is an HDFS client written from scratch in Rust, with no bindings to libhdfs or any use of Java. While it supports most common cluster configurations, it does not support every possible client configuration that could exist.

    "},{"location":"integrations/object-storage/hdfs/#supported-configurations","title":"Supported Configurations","text":"

By default, the client looks for existing Hadoop configs in the following manner:

• If the HADOOP_CONF_DIR environment variable is defined, load configs from $HADOOP_CONF_DIR/core-site.xml and $HADOOP_CONF_DIR/hdfs-site.xml
• Otherwise, if the HADOOP_HOME environment variable is set, load configs from $HADOOP_HOME/etc/hadoop/core-site.xml and $HADOOP_HOME/etc/hadoop/hdfs-site.xml

    Additionally, you can pass Hadoop configs as storage_options and these will take precedence over the above configs.

Currently the supported client configuration parameters are:

• dfs.ha.namenodes.* - name service support
• dfs.namenode.rpc-address.* - name service support
• fs.viewfs.mounttable.*.link.* - ViewFS links
• fs.viewfs.mounttable.*.linkFallback - ViewFS link fallback
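
For example, here is a minimal sketch of passing name service configs as storage_options (the cluster name mycluster, the namenode addresses, and the table path are placeholders):

from deltalake import DeltaTable\n\nstorage_options = {\n    \"dfs.ha.namenodes.mycluster\": \"nn1,nn2\",\n    \"dfs.namenode.rpc-address.mycluster.nn1\": \"namenode1.example.com:8020\",\n    \"dfs.namenode.rpc-address.mycluster.nn2\": \"namenode2.example.com:8020\",\n}\n\ndt = DeltaTable(\"hdfs://mycluster/path/to/table\", storage_options=storage_options)\n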

    If you find your setup is not supported, please file an issue in the hdfs-native repository.

    "},{"location":"integrations/object-storage/hdfs/#secure-clusters","title":"Secure Clusters","text":"

    The client supports connecting to secure clusters through both Kerberos authentication as well as token authentication, and all SASL protection types are supported. The highest supported protection mechanism advertised by the server will be used.

    "},{"location":"integrations/object-storage/hdfs/#kerberos-support","title":"Kerberos Support","text":"

    Kerberos is supported through dynamically loading the libgssapi_krb5 library. This must be installed separately through your package manager, and currently only works on Linux and Mac.

    Debian-based systems:

    apt-get install libgssapi-krb5-2\n

    RHEL-based systems:

    yum install krb5-libs\n

    MacOS:

    brew install krb5\n

Then simply kinit to get your TGT, and authentication to HDFS should just work.

    "},{"location":"integrations/object-storage/hdfs/#token-support","title":"Token Support","text":"

    Token authentication is supported by looking for a token file located at the environment variable HADOOP_TOKEN_FILE_LOCATION. This is the location systems like YARN will automatically place a delegation token, so things will just work inside of YARN jobs.

    "},{"location":"integrations/object-storage/hdfs/#issues","title":"Issues","text":"

    If you face any HDFS-specific issues, please report to the hdfs-native-object-store repository.

    "},{"location":"usage/appending-overwriting-delta-lake-table/","title":"Appending to and overwriting a Delta Lake table","text":"

This section explains how to append to an existing Delta table and how to overwrite a Delta table.

    "},{"location":"usage/appending-overwriting-delta-lake-table/#delta-lake-append-transactions","title":"Delta Lake append transactions","text":"

    Suppose you have a Delta table with the following contents:

    +-------+----------+\n|   num | letter   |\n|-------+----------|\n|     1 | a        |\n|     2 | b        |\n|     3 | c        |\n+-------+----------+\n

    Append two additional rows of data to the table:

    from deltalake import write_deltalake, DeltaTable\n\ndf = pd.DataFrame({\"num\": [8, 9], \"letter\": [\"dd\", \"ee\"]})\nwrite_deltalake(\"tmp/some-table\", df, mode=\"append\")\n

    Here are the updated contents of the Delta table:

    +-------+----------+\n|   num | letter   |\n|-------+----------|\n|     1 | a        |\n|     2 | b        |\n|     3 | c        |\n|     8 | dd       |\n|     9 | ee       |\n+-------+----------+\n

    Now let's see how to perform an overwrite transaction.

    "},{"location":"usage/appending-overwriting-delta-lake-table/#delta-lake-overwrite-transactions","title":"Delta Lake overwrite transactions","text":"

Now let's see how to overwrite the existing Delta table.

    df = pd.DataFrame({\"num\": [11, 22], \"letter\": [\"aa\", \"bb\"]})\nwrite_deltalake(\"tmp/some-table\", df, mode=\"overwrite\")\n

    Here are the contents of the Delta table after the overwrite operation:

    +-------+----------+\n|   num | letter   |\n|-------+----------|\n|    11 | aa       |\n|    22 | bb       |\n+-------+----------+\n

Overwriting just performs a logical delete. It doesn't physically remove the previous data from storage. Time travel back to the previous version to confirm that the old version of the table is still accessible.

    dt = DeltaTable(\"tmp/some-table\", version=1)\n\n+-------+----------+\n|   num | letter   |\n|-------+----------|\n|     1 | a        |\n|     2 | b        |\n|     3 | c        |\n|     8 | dd       |\n|     9 | ee       |\n+-------+----------+\n
    "},{"location":"usage/constraints/","title":"Adding a Constraint to a table","text":"

    Check constraints are a way to enforce that only data that meets the constraint is allowed to be added to the table.

    "},{"location":"usage/constraints/#add-the-constraint","title":"Add the Constraint","text":"Python Rust

    DeltaTable

    from deltalake import DeltaTable\n\ndt = DeltaTable(\"../rust/tests/data/simple_table\")\n\n# Check the schema before hand\nprint(dt.schema())\n# Add the constraint to the table.\ndt.alter.add_constraint({\"id_gt_0\": \"id > 0\"})\n

    DeltaTable

    let table = deltalake::open_table(\"../rust/tests/data/simple_table\").await?;\nlet ops = DeltaOps(table);\nops.with_constraint(\"id_gt_0\", \"id > 0\").await?;\n

After you have added the constraint to the table, attempting to append data that violates the constraint will instead throw an error.

    "},{"location":"usage/constraints/#verify-the-constraint-by-trying-to-add-some-data","title":"Verify the constraint by trying to add some data","text":"Python Rust
    from deltalake import write_deltalake, DeltaTable\nimport pandas as pd\n\ndt = DeltaTable(\"../rust/tests/data/simple_table\")\n\ndf = pd.DataFrame({\"id\": [-1]})\nwrite_deltalake(dt, df, mode=\"append\", engine=\"rust\")\n# _internal.DeltaProtocolError: Invariant violations: [\"Check or Invariant (id > 0) violated by value in row: [-1]\"]\n
    let table = deltalake::open_table(\"../rust/tests/data/simple_table\").await?;\nlet schema = table.get_state().arrow_schema()?;\nlet invalid_values: Vec<Arc<dyn Array>> = vec![\n    Arc::new(Int32Array::from(vec![-10]))\n];\nlet batch = RecordBatch::try_new(schema, invalid_values)?;\ntable.write(vec![batch]).await?;\n

    Note: ensure you use the engine='rust' parameter when writing to the table as this feature is not supported in the default pyarrow writer.

    "},{"location":"usage/create-delta-lake-table/","title":"Creating a Delta Lake Table","text":"

    This section explains how to create a Delta Lake table.

    You can easily write a DataFrame to a Delta table.

    pandasPolars
    from deltalake import write_deltalake\nimport pandas as pd\n\ndf = pd.DataFrame({\"num\": [1, 2, 3], \"letter\": [\"a\", \"b\", \"c\"]})\nwrite_deltalake(\"tmp/some-table\", df)\n
    import polars as pl\n\ndf = pl.DataFrame({\"num\": [1, 2, 3], \"letter\": [\"a\", \"b\", \"c\"]})\ndf.write_delta(\"tmp/some-table\")\n

    Here are the contents of the Delta table in storage:

    +-------+----------+\n|   num | letter   |\n|-------+----------|\n|     1 | a        |\n|     2 | b        |\n|     3 | c        |\n+-------+----------+\n
    "},{"location":"usage/deleting-rows-from-delta-lake-table/","title":"Deleting rows from a Delta Lake table","text":"

    This section explains how to delete rows from a Delta Lake table.

    Suppose you have the following Delta table with four rows:

    +-------+----------+\n|   num | letter   |\n|-------+----------|\n|     1 | a        |\n|     2 | b        |\n|     3 | c        |\n|     4 | d        |\n+-------+----------+\n

    Here's how to delete all the rows where the num is greater than 2:

    dt = DeltaTable(\"tmp/my-table\")\ndt.delete(\"num > 2\")\n

    Here are the contents of the Delta table after the delete operation has been performed:

    +-------+----------+\n|   num | letter   |\n|-------+----------|\n|     1 | a        |\n|     2 | b        |\n+-------+----------+\n

    dt.delete() accepts any SQL where clause. If no predicate is provided, all rows will be deleted.

    Read more in the API docs

    "},{"location":"usage/examining-table/","title":"Examining a Table","text":""},{"location":"usage/examining-table/#metadata","title":"Metadata","text":"

    The delta log maintains basic metadata about a table, including:

    • A unique id
    • A name, if provided
    • A description, if provided
    • The list of partitionColumns.
    • The created_time of the table
    • A map of table configuration. This includes fields such as delta.appendOnly, which if true indicates the table is not meant to have data deleted from it.

    Get metadata from a table with the DeltaTable.metadata() method:

    >>> from deltalake import DeltaTable\n>>> dt = DeltaTable(\"../rust/tests/data/simple_table\")\n>>> dt.metadata()\nMetadata(id: 5fba94ed-9794-4965-ba6e-6ee3c0d22af9, name: None, description: None, partitionColumns: [], created_time: 1587968585495, configuration={})\n
    "},{"location":"usage/examining-table/#schema","title":"Schema","text":"

The schema for the table is also saved in the transaction log. It can either be retrieved in the Delta Lake form as Schema or as a PyArrow schema. The former allows you to introspect any column-level metadata stored in the schema, while the latter represents the schema the table will be loaded into.

    Use DeltaTable.schema to retrieve the delta lake schema:

    >>> from deltalake import DeltaTable\n>>> dt = DeltaTable(\"../rust/tests/data/simple_table\")\n>>> dt.schema()\nSchema([Field(id, PrimitiveType(\"long\"), nullable=True)])\n

These schemas have a JSON representation that can be retrieved with to_json(). To reconstruct a schema from its JSON representation, use Schema.from_json().

    >>> dt.schema().to_json()\n'{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}}]}'\n
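
To go the other way, here is a quick sketch (assuming your deltalake version exposes Schema.from_json):

>>> from deltalake import Schema\n>>> Schema.from_json(dt.schema().to_json())\n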

    Use DeltaTable.schema.to_pyarrow() to retrieve the PyArrow schema:

    >>> dt.schema().to_pyarrow()\nid: int64\n
    "},{"location":"usage/examining-table/#history","title":"History","text":"

    Depending on what system wrote the table, the delta table may have provenance information describing what operations were performed on the table, when, and by whom. This information is retained for 30 days by default, unless otherwise specified by the table configuration delta.logRetentionDuration.

    Note

    This information is not written by all writers and different writers may use different schemas to encode the actions. For Spark\\'s format, see: https://docs.delta.io/latest/delta-utility.html#history-schema

    To view the available history, use DeltaTable.history:

    from deltalake import DeltaTable\n\ndt = DeltaTable(\"../rust/tests/data/simple_table\")\ndt.history()\n
    [{'timestamp': 1587968626537, 'operation': 'DELETE', 'operationParameters': {'predicate': '[\"((`id` % CAST(2 AS BIGINT)) = CAST(0 AS BIGINT))\"]'}, 'readVersion': 3, 'isBlindAppend': False},\n {'timestamp': 1587968614187, 'operation': 'UPDATE', 'operationParameters': {'predicate': '((id#697L % cast(2 as bigint)) = cast(0 as bigint))'}, 'readVersion': 2, 'isBlindAppend': False},\n {'timestamp': 1587968604143, 'operation': 'WRITE', 'operationParameters': {'mode': 'Overwrite', 'partitionBy': '[]'}, 'readVersion': 1, 'isBlindAppend': False},\n {'timestamp': 1587968596254, 'operation': 'MERGE', 'operationParameters': {'predicate': '(oldData.`id` = newData.`id`)'}, 'readVersion': 0, 'isBlindAppend': False},\n {'timestamp': 1587968586154, 'operation': 'WRITE', 'operationParameters': {'mode': 'ErrorIfExists', 'partitionBy': '[]'}, 'isBlindAppend': True}]\n
    "},{"location":"usage/examining-table/#current-add-actions","title":"Current Add Actions","text":"

    The active state for a delta table is determined by the Add actions, which provide the list of files that are part of the table and metadata about them, such as creation time, size, and statistics. You can get a data frame of the add actions data using DeltaTable.get_add_actions:

    >>> from deltalake import DeltaTable\n>>> dt = DeltaTable(\"../rust/tests/data/delta-0.8.0\")\n>>> dt.get_add_actions(flatten=True).to_pandas()\n                                                    path  size_bytes   modification_time  data_change  num_records  null_count.value  min.value  max.value\n0  part-00000-c9b90f86-73e6-46c8-93ba-ff6bfaf892a...         440 2021-03-06 15:16:07         True            2                 0          0          2\n1  part-00000-04ec9591-0b73-459e-8d18-ba5711d6cbe...         440 2021-03-06 15:16:16         True            2                 0          2          4\n

    This works even with past versions of the table:

    >>> dt = DeltaTable(\"../rust/tests/data/delta-0.8.0\", version=0)\n>>> dt.get_add_actions(flatten=True).to_pandas()\n                                                path  size_bytes   modification_time  data_change  num_records  null_count.value  min.value  max.value\n0  part-00000-c9b90f86-73e6-46c8-93ba-ff6bfaf892a...         440 2021-03-06 15:16:07         True            2                 0          0          2\n1  part-00001-911a94a2-43f6-4acb-8620-5e68c265498...         445 2021-03-06 15:16:07         True            3                 0          2          4\n
    "},{"location":"usage/installation/","title":"Installation","text":"

    The deltalake project can be installed via pip for Python or Cargo for Rust.

    "},{"location":"usage/installation/#install-delta-lake-for-python","title":"Install Delta Lake for Python","text":"

    With pip:

    pip install deltalake\n

    With Conda:

    conda install -c conda-forge deltalake\n
    "},{"location":"usage/installation/#install-delta-lake-for-rust","title":"Install Delta Lake for Rust","text":"

    With Cargo:

    cargo add deltalake\n
    "},{"location":"usage/installation/#run-delta-lake-and-pandas-in-a-jupyter-notebook","title":"Run Delta Lake and pandas in a Jupyter Notebook","text":"

    You can easily run Delta Lake and pandas in a Jupyter notebook.

    Create an environment file with the dependencies as follows:

    name: deltalake-minimal\nchannels:\n  - conda-forge\n  - defaults\ndependencies:\n  - python=3.11\n  - ipykernel\n  - pandas\n  - polars\n  - jupyterlab\n  - deltalake\n

    Create a virtual environment with the dependencies:

    conda env create -f deltalake-minimal.yml\n

    Open the Jupyter notebook and run commands as follows:
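    Activate the environment with conda activate deltalake-minimal and launch JupyterLab with jupyter lab. In a notebook cell you can then write and read a small Delta table; a minimal sketch, assuming a hypothetical local path tmp/jupyter_demo:

    import pandas as pd
    from deltalake import DeltaTable, write_deltalake

    # Write a tiny table, then read it back
    df = pd.DataFrame({"x": [1, 2, 3]})
    write_deltalake("tmp/jupyter_demo", df)  # hypothetical local path

    dt = DeltaTable("tmp/jupyter_demo")
    print(dt.to_pandas())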

    "},{"location":"usage/loading-table/","title":"Loading a Delta Table","text":"

    A DeltaTable represents the state of a delta table at a particular version. This includes which files are currently part of the table, the schema of the table, and other metadata such as creation time.

    Python Rust

    DeltaTable

    from deltalake import DeltaTable\n\ndt = DeltaTable(\"../rust/tests/data/delta-0.2.0\")\nprint(f\"Version: {dt.version()}\")\nprint(f\"Files: {dt.files()}\")\n

    DeltaTable

    let table = deltalake::open_table(\"../rust/tests/data/simple_table\").await.unwrap();\nprintln!(\"Version: {}\", table.version());\nprintln!(\"Files: {}\", table.get_files());\n

    Depending on your storage backend, you could use the storage_options parameter to provide some configuration. Configuration is defined for specific backends - s3 options, azure options, gcs options.

    >>> storage_options = {\"AWS_ACCESS_KEY_ID\": \"THE_AWS_ACCESS_KEY_ID\", \"AWS_SECRET_ACCESS_KEY\":\"THE_AWS_SECRET_ACCESS_KEY\"}\n>>> dt = DeltaTable(\"../rust/tests/data/delta-0.2.0\", storage_options=storage_options)\n

    The configuration can also be provided via the environment, and the basic service provider is derived from the URL being used. We try to support many of the well-known formats to identify basic service properties.

    S3:

    • s3://\\<bucket>/\\<path>
    • s3a://\\<bucket>/\\<path>

    Note that delta-rs does not read credentials from a local .aws/config or .aws/creds file. Credentials can be accessed from environment variables, ec2 metadata, profiles or web identity. You can also pass credentials to storage_options using AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY.
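    For example, S3 credentials can be supplied through environment variables before loading the table; a minimal sketch with placeholder values and a hypothetical bucket:

    import os
    from deltalake import DeltaTable

    # Placeholder credentials; in practice these often come from your environment,
    # an instance profile, or web identity
    os.environ["AWS_ACCESS_KEY_ID"] = "THE_AWS_ACCESS_KEY_ID"
    os.environ["AWS_SECRET_ACCESS_KEY"] = "THE_AWS_SECRET_ACCESS_KEY"
    os.environ["AWS_REGION"] = "us-east-1"

    dt = DeltaTable("s3://my-bucket/path/to/table")  # hypothetical bucket and path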

    Azure:

    • az://\\<container>/\\<path>
    • adl://\\<container>/\\<path>
    • abfs://\\<container>/\\<path>

    GCS:

    • gs://\\<bucket>/\\<path>

    Alternatively, if you have a data catalog you can load it by reference to a database and table name. Currently only AWS Glue is supported.

    For AWS Glue catalog, use AWS environment variables to authenticate.

    >>> from deltalake import DeltaTable\n>>> from deltalake import DataCatalog\n>>> database_name = \"simple_database\"\n>>> table_name = \"simple_table\"\n>>> data_catalog = DataCatalog.AWS\n>>> dt = DeltaTable.from_data_catalog(data_catalog=data_catalog, database_name=database_name, table_name=table_name)\n>>> dt.to_pyarrow_table().to_pydict()\n{'id': [5, 7, 9, 5, 6, 7, 8, 9]}\n
    "},{"location":"usage/loading-table/#custom-storage-backends","title":"Custom Storage Backends","text":"

    While delta always needs its internal storage backend to be properly configured in order to manage the delta log, it may sometimes be advantageous - and is common practice in the arrow world - to customize the storage interface used for reading the bulk data.

    deltalake will work with any storage compliant with pyarrow.fs.FileSystem, however the root of the filesystem has to be adjusted to point at the root of the Delta table. We can achieve this by wrapping the custom filesystem into a pyarrow.fs.SubTreeFileSystem.

    import pyarrow.fs as fs\nfrom deltalake import DeltaTable\n\npath = \"<path/to/table>\"\nfilesystem = fs.SubTreeFileSystem(path, fs.LocalFileSystem())\n\ndt = DeltaTable(path)\nds = dt.to_pyarrow_dataset(filesystem=filesystem)\n

    When using the pyarrow factory method for file systems, the normalized path is provided on creation. In case of S3 this would look something like:

    import pyarrow.fs as fs\nfrom deltalake import DeltaTable\n\ntable_uri = \"s3://<bucket>/<path>\"\nraw_fs, normalized_path = fs.FileSystem.from_uri(table_uri)\nfilesystem = fs.SubTreeFileSystem(normalized_path, raw_fs)\n\ndt = DeltaTable(table_uri)\nds = dt.to_pyarrow_dataset(filesystem=filesystem)\n
    "},{"location":"usage/loading-table/#time-travel","title":"Time Travel","text":"

    To load previous table states, you can provide the version number you wish to load:

    >>> dt = DeltaTable(\"../rust/tests/data/simple_table\", version=2)\n

    Once you\\'ve loaded a table, you can also change versions using either a version number or datetime string:

    >>> dt.load_version(1)\n>>> dt.load_with_datetime(\"2021-11-04 00:05:23.283+00:00\")\n

    Warning

    Previous table versions may not exist if they have been vacuumed, in which case an exception will be thrown. See Vacuuming tables for more information.

    "},{"location":"usage/managing-tables/","title":"Managing Delta Tables","text":""},{"location":"usage/managing-tables/#vacuuming-tables","title":"Vacuuming tables","text":"

    Vacuuming a table will delete any files that have been marked for deletion. This may make some past versions of a table invalid, so this can break time travel. However, it will save storage space. Vacuum will retain files in a certain window, by default one week, so time travel will still work in shorter ranges.

    Delta tables usually don't delete old files automatically, so vacuuming regularly is considered good practice, unless the table is only appended to.

    Use DeltaTable.vacuum to perform the vacuum operation. Note that to prevent accidental deletion, the function performs a dry-run by default: it will only list the files to be deleted. Pass dry_run=False to actually delete files.

    >>> dt = DeltaTable(\"../rust/tests/data/simple_table\")\n>>> dt.vacuum()\n['../rust/tests/data/simple_table/part-00006-46f2ff20-eb5d-4dda-8498-7bfb2940713b-c000.snappy.parquet',\n '../rust/tests/data/simple_table/part-00190-8ac0ae67-fb1d-461d-a3d3-8dc112766ff5-c000.snappy.parquet',\n '../rust/tests/data/simple_table/part-00164-bf40481c-4afd-4c02-befa-90f056c2d77a-c000.snappy.parquet',\n ...]\n>>> dt.vacuum(dry_run=False) # Don't run this unless you are sure!\n
    "},{"location":"usage/managing-tables/#optimizing-tables","title":"Optimizing tables","text":"

    Optimizing tables is not currently supported.

    "},{"location":"usage/overview/","title":"Usage","text":"

    This guide teaches you how to use Delta Lake. You will learn how to create Delta tables, run queries, perform DML operations, and optimize your tables.

    It's easy to use Delta Lake with pandas, Polars, Rust, or any other PyArrow-like DataFrame library.

    See the Spark Delta Lake documentation if you're using Delta Lake with Spark.

    "},{"location":"usage/querying-delta-tables/","title":"Querying Delta Tables","text":"

    Delta tables can be queried in several ways. By loading as Arrow data or an Arrow dataset, they can be used by compatible engines such as Pandas and DuckDB. By passing the list of files, they can be loaded into other engines such as Dask.

    Delta tables are often larger than can fit into memory on a single computer, so this module provides ways to read only the parts of the data you need. Partition filters allow you to skip reading files that are part of irrelevant partitions. Only loading the columns required also saves memory. Finally, some methods allow reading tables batch-by-batch, allowing you to process the whole table while only having a portion loaded at any given time.

    To load into Pandas or a PyArrow table use the DeltaTable.to_pandas and DeltaTable.to_pyarrow_table methods, respectively. Both of these support filtering partitions and selecting particular columns.

    >>> from deltalake import DeltaTable\n>>> dt = DeltaTable(\"../rust/tests/data/delta-0.8.0-partitioned\")\n>>> dt.schema().to_pyarrow()\nvalue: string\nyear: string\nmonth: string\nday: string\n>>> dt.to_pandas(partitions=[(\"year\", \"=\", \"2021\")], columns=[\"value\"])\n      value\n0     6\n1     7\n2     5\n3     4\n>>> dt.to_pyarrow_table(partitions=[(\"year\", \"=\", \"2021\")], columns=[\"value\"])\npyarrow.Table\nvalue: string\n

    Converting to a PyArrow Dataset allows you to filter on columns other than partition columns and load the result as a stream of batches rather than a single table. Convert to a dataset using DeltaTable.to_pyarrow_dataset. Filters applied to datasets will use the partition values and file statistics from the Delta transaction log and push down any other filters to the scanning operation.

    >>> import pyarrow.dataset as ds\n>>> dataset = dt.to_pyarrow_dataset()\n>>> condition = (ds.field(\"year\") == \"2021\") & (ds.field(\"value\") > \"4\")\n>>> dataset.to_table(filter=condition, columns=[\"value\"]).to_pandas()\n  value\n0     6\n1     7\n2     5\n>>> batch_iter = dataset.to_batches(filter=condition, columns=[\"value\"], batch_size=2)\n>>> for batch in batch_iter: print(batch.to_pandas())\n  value\n0     6\n1     7\n  value\n0     5\n

    PyArrow datasets may also be passed to compatible query engines, such as DuckDB

    >>> import duckdb\n>>> ex_data = duckdb.arrow(dataset)\n>>> ex_data.filter(\"year = 2021 and value > 4\").project(\"value\")\n---------------------\n-- Expression Tree --\n---------------------\nProjection [value]\n  Filter [year=2021 AND value>4]\n    arrow_scan(140409099470144, 4828104688, 1000000)\n\n---------------------\n-- Result Columns  --\n---------------------\n- value (VARCHAR)\n\n---------------------\n-- Result Preview  --\n---------------------\nvalue\nVARCHAR\n[ Rows: 3]\n6\n7\n5\n

    Finally, you can always pass the list of file paths to an engine. For example, you can pass them to dask.dataframe.read_parquet:

    >>> import dask.dataframe as dd\n>>> df = dd.read_parquet(dt.file_uris())\n>>> df\nDask DataFrame Structure:\n                value             year            month              day\nnpartitions=6\n               object  category[known]  category[known]  category[known]\n                  ...              ...              ...              ...\n...               ...              ...              ...              ...\n                  ...              ...              ...              ...\n                  ...              ...              ...              ...\nDask Name: read-parquet, 6 tasks\n>>> df.compute()\n  value  year month day\n0     1  2020     1   1\n0     2  2020     2   3\n0     3  2020     2   5\n0     4  2021     4   5\n0     5  2021    12   4\n0     6  2021    12  20\n1     7  2021    12  20\n
    "},{"location":"usage/read-cdf/","title":"Reading the Change Data Feed from a Delta Table","text":"

    Reading the change data feed (CDF) from a Delta table with change data enabled is straightforward.

    "},{"location":"usage/read-cdf/#reading-cdf-log","title":"Reading CDF Log","text":"Python Rust
    import polars\nfrom deltalake import DeltaTable\n\ndt = DeltaTable(\"../rust/tests/data/cdf-table\")\ntable = dt.load_cdf(starting_version=0, ending_version=4).read_all()\npt = polars.from_arrow(table)\npt.group_by(\"_commit_version\").len().sort(\"len\", descending=True)\n
    #[tokio::main]\nasync fn main() -> Result<(), Box<dyn std::error::Error>> {\n\n    let table = deltalake::open_table(\"../rust/tests/data/cdf-table\").await?;\n    let ops = DeltaOps(table);\n    let cdf = ops.load_cdf()\n        .with_starting_version(0)\n        .with_ending_version(4)\n        .build()\n        .await?;\n\n    arrow_cast::pretty::print_batches(&cdf)?;\n\n    Ok(())\n}\n

    The output can then be used in various execution engines. The Python example shows how one might consume the CDF feed with Polars.

    "},{"location":"usage/optimize/delta-lake-z-order/","title":"Delta Lake Z Order","text":"

    This section explains how to Z Order a Delta table.

    Z Ordering colocates similar data in the same files, which allows for better file skipping and faster queries.

    Suppose you have a table with first_name, age, and country columns.

    If you Z Order the data by the country column, then individuals from the same country will be stored in the same files. When you subsequently query the data for individuals from a given country, it will execute faster because more data can be skipped.

    Here's how to Z Order a Delta table:

    from deltalake import DeltaTable\n\ndt = DeltaTable(\"tmp\")\ndt.optimize.z_order([\"country\"])\n
    "},{"location":"usage/optimize/small-file-compaction-with-optimize/","title":"Delta Lake small file compaction with optimize","text":"

    This post shows you how to perform small file compaction using the optimize method, which was added to the DeltaTable class in version 0.9.0. This command compacts small files into larger files, which reduces the number of files and speeds up queries.

    This is very helpful for workloads that append frequently. For example, if you have a table that is appended to every 10 minutes, after a year you will have 52,560 files in the table. If the table is partitioned by another dimension, you will have 52,560 files per partition; with just 100 unique values that's millions of files. By running optimize periodically, you can reduce the number of files in the table to a more manageable number.

    Typically, you will run optimize less frequently than you append data. If possible, you might run optimize once you know you have finished writing to a particular partition. For example, on a table partitioned by date, you might append data every 10 minutes, but only run optimize once a day at the end of the day. This will ensure you don't need to compact the same data twice.
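    A minimal sketch of such a daily job, compacting only yesterday's partition on a hypothetical table partitioned by a date column (the full walkthrough below shows this pattern in context):

    from datetime import date, timedelta
    from deltalake import DeltaTable

    yesterday = (date.today() - timedelta(days=1)).isoformat()
    dt = DeltaTable("path/to/table")  # hypothetical table partitioned by "date"
    dt.optimize(partition_filters=[("date", "=", yesterday)])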

    This section will also teach you how to use vacuum to physically remove files from storage that are no longer needed. You\u2019ll often want to vacuum after running optimize to remove the small files from storage once they\u2019ve been compacted into larger files.

    Let\u2019s start with an example to explain these key concepts. All the code covered in this post is stored in this notebook in case you\u2019d like to follow along.

    "},{"location":"usage/optimize/small-file-compaction-with-optimize/#create-a-delta-table-with-small-files","title":"Create a Delta table with small files","text":"

    Let\u2019s start by creating a Delta table with a lot of small files so we can demonstrate the usefulness of the optimize command.

    Start by writing a function that generates one thousand rows of random data given a timestamp.

    # Imports used throughout this walkthrough\nimport itertools\nfrom datetime import datetime, timedelta\n\nimport pyarrow as pa\nimport pyarrow.compute as pc\nfrom deltalake import DeltaTable, write_deltalake\n\n\ndef record_observations(date: datetime) -> pa.Table:\n    \"\"\"Pulls data for a certain datetime\"\"\"\n    nrows = 1000\n    return pa.table(\n        {\n            \"date\": pa.array([date.date()] * nrows),\n            \"timestamp\": pa.array([date] * nrows),\n            \"value\": pc.random(nrows),\n        }\n    )\n

    Let\u2019s run this function and observe the output:

    record_observations(datetime(2021, 1, 1, 12)).to_pandas()\n\n    date                timestamp   value\n0   2021-01-01  2021-01-01 12:00:00 0.3186397383362023\n1   2021-01-01  2021-01-01 12:00:00 0.04253766974259088\n2   2021-01-01  2021-01-01 12:00:00 0.9355682965171573\n\u2026\n999 2021-01-01  2021-01-01 12:00:00 0.23207037062879843\n

    Let\u2019s write 100 hours worth of data to the Delta table.

    # Every hour starting at midnight on 2021-01-01\nhours_iter = (datetime(2021, 1, 1) + timedelta(hours=i) for i in itertools.count())\n\n# Write 100 hours worth of data\nfor timestamp in itertools.islice(hours_iter, 100):\n    write_deltalake(\n        \"observation_data\",\n        record_observations(timestamp),\n        partition_by=[\"date\"],\n        mode=\"append\",\n    )\n

    This data was appended to the Delta table in 100 separate transactions, so the table will contain 100 transaction log entries and 100 data files. You can see the number of files with the files() method.

    dt = DeltaTable(\"observation_data\")\nlen(dt.files()) # 100\n

    Here\u2019s how the files are persisted in storage.

    observation_data\n\u251c\u2500\u2500 _delta_log\n\u2502   \u251c\u2500\u2500 00000000000000000000.json\n\u2502   \u251c\u2500\u2500 \u2026\n\u2502   \u2514\u2500\u2500 00000000000000000099.json\n\u251c\u2500\u2500 date=2021-01-01\n\u2502   \u251c\u2500\u2500 0-cfe227c6-edd9-4369-a1b0-db4559a2e693-0.parquet\n\u2502   \u251c\u2500\u2500 \u2026\n\u2502   \u251c\u2500\u2500 23-a4ace29e-e73e-40a1-81d3-0f5dc13093de-0.parquet\n\u251c\u2500\u2500 date=2021-01-02\n\u2502   \u251c\u2500\u2500 24-9698b456-66eb-4075-8732-fe56d81edb60-0.parquet\n\u2502   \u251c\u2500\u2500 \u2026\n\u2502   \u2514\u2500\u2500 47-d3fce527-e018-4c02-8acd-a649f6f523d2-0.parquet\n\u251c\u2500\u2500 date=2021-01-03\n\u2502   \u251c\u2500\u2500 48-fd90a7fa-5a14-42ed-9f59-9fe48d87899d-0.parquet\n\u2502   \u251c\u2500\u2500 \u2026\n\u2502   \u2514\u2500\u2500 71-5f143ade-8ae2-4854-bdc5-61154175665f-0.parquet\n\u251c\u2500\u2500 date=2021-01-04\n\u2502   \u251c\u2500\u2500 72-477c10fe-dc09-4087-80f0-56006e4a7911-0.parquet\n\u2502   \u251c\u2500\u2500 \u2026\n\u2502   \u2514\u2500\u2500 95-1c92cbce-8af4-4fe4-9c11-832245cf4d40-0.parquet\n\u2514\u2500\u2500 date=2021-01-05\n    \u251c\u2500\u2500 96-1b878ee5-25fd-431a-bc3e-6dcacc96b470-0.parquet\n    \u251c\u2500\u2500 \u2026\n    \u2514\u2500\u2500 99-9650ed63-c195-433d-a86b-9469088c14ba-0.parquet\n

    Each of these Parquet files is tiny - only about 10 KB. Let\u2019s see how to compact these tiny files into larger files, which is more efficient for data queries.

    "},{"location":"usage/optimize/small-file-compaction-with-optimize/#compact-small-files-in-the-delta-table-with-optimize","title":"Compact small files in the Delta table with optimize","text":"

    Let\u2019s run the optimize command to compact the existing small files into larger files:

    dt = DeltaTable(\"observation_data\")\n\ndt.optimize()\n

    Here\u2019s the output of the command:

    {'numFilesAdded': 5,\n 'numFilesRemoved': 100,\n 'filesAdded': {'min': 39000,\n  'max': 238282,\n  'avg': 198425.6,\n  'totalFiles': 5,\n  'totalSize': 992128},\n 'filesRemoved': {'min': 10244,\n  'max': 10244,\n  'avg': 10244.0,\n  'totalFiles': 100,\n  'totalSize': 1024400},\n 'partitionsOptimized': 5,\n 'numBatches': 1,\n 'totalConsideredFiles': 100,\n 'totalFilesSkipped': 0,\n 'preserveInsertionOrder': True}\n

    The optimize operation has added 5 new files and marked 100 existing files for removal (this is also known as \u201ctombstoning\u201d files). It has compacted the 100 tiny files into 5 larger files.

    Let\u2019s append some more data to the Delta table and see how we can selectively run optimize on the new data that\u2019s added.

    "},{"location":"usage/optimize/small-file-compaction-with-optimize/#handling-incremental-updates-with-optimize","title":"Handling incremental updates with optimize","text":"

    Let\u2019s append another 24 hours of data to the Delta table:

    for timestamp in itertools.islice(hours_iter, 24):\n    write_deltalake(\n        dt,\n        record_observations(timestamp),\n        partition_by=[\"date\"],\n        mode=\"append\",\n    )\n

    We can use get_add_actions() to introspect the table state. We can see that 2021-01-06 has only a few hours of data so far, so we don't want to optimize that yet. But 2021-01-05 has all 24 hours of data, so it's ready to be optimized.

    dt.get_add_actions(flatten=True).to_pandas()[\n    \"partition.date\"\n].value_counts().sort_index()\n\n2021-01-01     1\n2021-01-02     1\n2021-01-03     1\n2021-01-04     1\n2021-01-05    21\n2021-01-06     4\n

    To optimize a single partition, you can pass in a partition_filters argument specifying which partitions to optimize.

    dt.optimize(partition_filters=[(\"date\", \"=\", \"2021-01-05\")])\n\n{'numFilesAdded': 1,\n 'numFilesRemoved': 21,\n 'filesAdded': {'min': 238282,\n  'max': 238282,\n  'avg': 238282.0,\n  'totalFiles': 1,\n  'totalSize': 238282},\n 'filesRemoved': {'min': 10244,\n  'max': 39000,\n  'avg': 11613.333333333334,\n  'totalFiles': 21,\n  'totalSize': 243880},\n 'partitionsOptimized': 1,\n 'numBatches': 1,\n 'totalConsideredFiles': 21,\n 'totalFilesSkipped': 0,\n 'preserveInsertionOrder': True}\n

    This optimize operation tombstones 21 small data files and adds one file with all the existing data properly condensed. Let\u2019s take a look at a portion of the _delta_log/00000000000000000125.json file, which is the transaction log entry that corresponds with this incremental optimize command.

    {\n  \"remove\": {\n    \"path\": \"date=2021-01-05/part-00000-41178aab-2491-488f-943d-8f03867295ee-c000.snappy.parquet\",\n    \"deletionTimestamp\": 1683465499480,\n    \"dataChange\": false,\n    \"extendedFileMetadata\": null,\n    \"partitionValues\": {\n      \"date\": \"2021-01-05\"\n    },\n    \"size\": 39000,\n    \"tags\": null\n  }\n}\n\n{\n  \"remove\": {\n    \"path\": \"date=2021-01-05/101-79ae6fc9-c0cc-49ec-bb94-9aba879ac949-0.parquet\",\n    \"deletionTimestamp\": 1683465499481,\n    \"dataChange\": false,\n    \"extendedFileMetadata\": null,\n    \"partitionValues\": {\n      \"date\": \"2021-01-05\"\n    },\n    \"size\": 10244,\n    \"tags\": null\n  }\n}\n\n\u2026\n\n{\n  \"add\": {\n    \"path\": \"date=2021-01-05/part-00000-4b020a40-c836-4a11-851f-4691370c9f3a-c000.snappy.parquet\",\n    \"size\": 238282,\n    \"partitionValues\": {\n      \"date\": \"2021-01-05\"\n    },\n    \"modificationTime\": 1683465499493,\n    \"dataChange\": false,\n    \"stats\": \"{\\\"numRecords\\\":24000,\\\"minValues\\\":{\\\"value\\\":0.00005581532256615507,\\\"timestamp\\\":\\\"2021-01-05T00:00:00.000Z\\\"},\\\"maxValues\\\":{\\\"timestamp\\\":\\\"2021-01-05T23:00:00.000Z\\\",\\\"value\\\":0.9999911402868216},\\\"nullCount\\\":{\\\"timestamp\\\":0,\\\"value\\\":0}}\",\n    \"tags\": null\n  }\n}\n

    The transaction log indicates that many files have been tombstoned and one file is added, as expected.

    The Delta Lake optimize command \u201cremoves\u201d data by marking the data files as removed in the transaction log. The optimize command doesn\u2019t physically delete the Parquet file from storage. Optimize performs a \u201clogical remove\u201d not a \u201cphysical remove\u201d.

    Delta Lake uses logical operations so you can time travel back to earlier versions of your data. You can vacuum your Delta table to physically remove Parquet files from storage if you don\u2019t need to time travel and don\u2019t want to pay to store the tombstoned files.

    "},{"location":"usage/optimize/small-file-compaction-with-optimize/#vacuuming-after-optimizing","title":"Vacuuming after optimizing","text":"

    The vacuum command deletes all files from storage that are marked for removal in the transaction log and are older than the retention period, which is 7 days by default.

    It\u2019s normally a good idea to have a retention period of at least 7 days. For purposes of this example, we will set the retention period to zero, just so you can see how the files get removed from storage. Adjusting the retention period in this manner isn\u2019t recommended for production use cases.

    Let\u2019s run the vacuum command:

    dt.vacuum(retention_hours=0, enforce_retention_duration=False, dry_run=False)\n

    The command returns a list of all the files that are removed from storage:

    ['date=2021-01-02/39-a98680f2-0e0e-4f26-a491-18b183f9eb05-0.parquet',\n 'date=2021-01-02/41-e96bc8bb-c571-484c-b534-e897424fb7da-0.parquet',\n \u2026\n 'date=2021-01-01/0-cfe227c6-edd9-4369-a1b0-db4559a2e693-0.parquet',\n 'date=2021-01-01/18-ded53418-172b-4e40-bf2e-7c8142e71bd1-0.parquet']\n

    Let\u2019s look at the content of the Delta table now that all the really small files have been removed from storage:

    observation_data\n\u251c\u2500\u2500 _delta_log\n\u2502   \u251c\u2500\u2500 00000000000000000000.json\n\u2502   \u251c\u2500\u2500 00000000000000000001.json\n\u2502   \u251c\u2500\u2500 \u2026\n\u2502   \u251c\u2500\u2500 00000000000000000124.json\n\u2502   \u2514\u2500\u2500 00000000000000000125.json\n\u251c\u2500\u2500 date=2021-01-01\n\u2502   \u2514\u2500\u2500 part-00000-31e3df5a-8bbe-425c-b85d-77794f922837-c000.snappy.parquet\n\u251c\u2500\u2500 date=2021-01-02\n\u2502   \u2514\u2500\u2500 part-00000-8af07878-b179-49ce-a900-d58595ffb60a-c000.snappy.parquet\n\u251c\u2500\u2500 date=2021-01-03\n\u2502   \u2514\u2500\u2500 part-00000-5e980864-b32f-4686-a58d-a75fae455c1e-c000.snappy.parquet\n\u251c\u2500\u2500 date=2021-01-04\n\u2502   \u2514\u2500\u2500 part-00000-1e82d23b-084d-47e3-9790-d68289c39837-c000.snappy.parquet\n\u251c\u2500\u2500 date=2021-01-05\n\u2502   \u2514\u2500\u2500 part-00000-4b020a40-c836-4a11-851f-4691370c9f3a-c000.snappy.parquet\n\u2514\u2500\u2500 date=2021-01-06\n    \u251c\u2500\u2500 121-0ecb5d70-4a28-4cd4-b2d2-89ee2285eaaa-0.parquet\n    \u251c\u2500\u2500 122-6b2d2758-9154-4392-b287-fe371ee507ec-0.parquet\n    \u251c\u2500\u2500 123-551d318f-4968-441f-83fc-89f98cd15daf-0.parquet\n    \u2514\u2500\u2500 124-287309d3-662e-449d-b4da-2e67b7cc0557-0.parquet\n

    All the partitions only contain a single file now, except for the date=2021-01-06 partition that has not been compacted yet.

    An entire partition won\u2019t necessarily get compacted to a single data file when optimize is run. Each partition has data files that are condensed to the target file size.
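    The target file size can also be tuned when compacting. A minimal sketch, assuming your version of deltalake exposes optimize.compact with a target_size parameter (in bytes):

    from deltalake import DeltaTable

    dt = DeltaTable("observation_data")
    # Aim for roughly 256 MB files instead of the default target size
    dt.optimize.compact(target_size=256 * 1024 * 1024)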

    "},{"location":"usage/optimize/small-file-compaction-with-optimize/#what-causes-the-small-file-problem","title":"What causes the small file problem?","text":"

    Delta tables can accumulate small files for a variety of reasons:

    • User error: users can accidentally write files that are too small. Users should sometimes repartition in memory before writing to disk to avoid appending files that are too small (see the sketch after this list).
    • Frequent appends: systems that append more often tend to write more, smaller files. A pipeline that appends every minute will generally generate ten times as many small files as a system that appends every ten minutes.
    • Appending to partitioned data lakes with high cardinality columns can also cause small files. If you append every hour to a table that\u2019s partitioned on a column with 1,000 distinct values, then every append could create 1,000 new files. Partitioning by date avoids this problem because the data isn\u2019t split up across partitions in this manner.
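    As mentioned in the first bullet, one way to avoid writing many tiny files is to combine small batches in memory and write them in a single transaction. A minimal sketch, assuming the batches fit comfortably in memory and using a hypothetical table path:

    import pyarrow as pa
    from deltalake import write_deltalake

    # Several small batches that would each become a tiny file if written separately
    small_tables = [pa.table({"id": [i], "value": [i * 10]}) for i in range(100)]

    # Concatenate in memory and write once, producing far fewer files
    combined = pa.concat_tables(small_tables)
    write_deltalake("path/to/combined_table", combined, mode="append")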
    "},{"location":"usage/optimize/small-file-compaction-with-optimize/#conclusion","title":"Conclusion","text":"

    This page showed you how to create a Delta table with many small files, compact the small files into larger files with optimize, and remove the tombstoned files from storage with vacuum.

    You also learned about how to incrementally optimize partitioned Delta tables, so you only compact newly added data.

    An excessive number of small files slows down Delta table queries, so periodic compaction is important. Make sure to properly maintain your Delta tables, so performance does not degrade over time.

    "},{"location":"usage/writing/","title":"Writing Delta Tables","text":"

    For overwrites and appends, use write_deltalake. If the table does not already exist, it will be created. The data parameter will accept a Pandas DataFrame, a PyArrow Table, or an iterator of PyArrow Record Batches.

    >>> from deltalake import write_deltalake\n>>> df = pd.DataFrame({'x': [1, 2, 3]})\n>>> write_deltalake('path/to/table', df)\n

    Note: write_deltalake accepts a Pandas DataFrame, but will convert it to an Arrow table before writing. See caveats in pyarrow:python/pandas.
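    write_deltalake can also consume an iterator of PyArrow record batches, which is useful when data is produced incrementally and should not be materialized as a single table. A minimal sketch, assuming the schema is supplied explicitly alongside the iterator and using a hypothetical path:

    import pyarrow as pa
    from deltalake import write_deltalake

    schema = pa.schema([("x", pa.int64())])

    def batches():
        # Yield a few small record batches instead of building one big table
        for start in range(0, 9, 3):
            yield pa.record_batch([pa.array(range(start, start + 3))], schema=schema)

    # The schema is passed explicitly because a plain iterator does not carry one
    write_deltalake("path/to/batched_table", batches(), schema=schema)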

    By default, writes create a new table and error if it already exists. This is controlled by the mode parameter, which mirrors the behavior of Spark's pyspark.sql.DataFrameWriter.saveAsTable DataFrame method. To overwrite pass in mode='overwrite' and to append pass in mode='append':

    >>> write_deltalake('path/to/table', df, mode='overwrite')\n>>> write_deltalake('path/to/table', df, mode='append')\n

    write_deltalake will raise ValueError if the schema of the data passed to it differs from the existing table's schema. If you wish to alter the schema as part of an overwrite, pass in schema_mode=\"overwrite\" or schema_mode=\"merge\". schema_mode=\"overwrite\" will completely overwrite the schema, even if columns are dropped; merge will append the new columns and fill missing columns with null. schema_mode=\"merge\" is also supported on append operations.
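    For example, appending a DataFrame that has an extra column while merging schemas; a minimal sketch, noting as an assumption that schema merging on append may require the rust engine in your version:

    import pandas as pd
    from deltalake import write_deltalake

    # Assuming 'path/to/table' already exists with a single column 'x' (as created above)
    df_new = pd.DataFrame({'x': [4, 5], 'y': ['a', 'b']})
    write_deltalake(
        'path/to/table',
        df_new,
        mode='append',
        schema_mode='merge',
        engine='rust',  # assumption: the rust engine may be required for schema merging on append
    )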

    "},{"location":"usage/writing/#overwriting-a-partition","title":"Overwriting a partition","text":"

    You can overwrite a specific partition by using mode=\"overwrite\" together with partition_filters. This will remove all files within the matching partition and insert your data as new files. This can only be done on one partition at a time. All of the input data must belong to that partition or else the method will raise an error.

    >>> from deltalake import write_deltalake\n>>> df = pd.DataFrame({'x': [1, 2, 3], 'y': ['a', 'a', 'b']})\n>>> write_deltalake('path/to/table', df, partition_by=['y'])\n\n>>> table = DeltaTable('path/to/table')\n>>> df2 = pd.DataFrame({'x': [100], 'y': ['b']})\n>>> write_deltalake(table, df2, partition_filters=[('y', '=', 'b')], mode=\"overwrite\")\n\n>>> table.to_pandas()\n     x  y\n0    1  a\n1    2  a\n2  100  b\n

    This method could also be used to insert a new partition if one doesn't already exist, making this operation idempotent.

    "},{"location":"usage/writing/#overwriting-part-of-the-table-data-using-a-predicate","title":"Overwriting part of the table data using a predicate","text":"

    Note

    This predicate is often called a replaceWhere predicate

    When you don\u2019t specify the predicate, the overwrite save mode will replace the entire table. Instead of replacing the entire table (which is costly!), you may want to overwrite only the specific parts of the table that should be changed. In this case, you can use a predicate to overwrite only the relevant records or partitions.

    Note

    Data written must conform to the same predicate, i.e. not contain any records that don't match the predicate condition; otherwise the operation will fail.

    Python Rust

    replaceWhere

    import pyarrow as pa\nfrom deltalake import write_deltalake\n\n# Assuming there is already a table in this location with some records where `id = '1'` which we want to overwrite\ntable_path = \"/tmp/my_table\"\ndata = pa.table(\n    {\n        \"id\": pa.array([\"1\", \"1\"], pa.string()),\n        \"value\": pa.array([11, 12], pa.int64()),\n    }\n)\nwrite_deltalake(\n    table_path,\n    data,\n    mode=\"overwrite\",\n    predicate=\"id = '1'\",\n    engine=\"rust\",\n)\n

    replaceWhere

    // Assuming there is already a table in this location with some records where `id = '1'` which we want to overwrite\nuse arrow_array::RecordBatch;\nuse arrow_schema::{DataType, Field, Schema as ArrowSchema};\nuse deltalake::datafusion::logical_expr::{col, lit};\nuse deltalake::protocol::SaveMode;\nuse deltalake::DeltaOps;\n\nlet schema = ArrowSchema::new(vec![\n    Field::new(\"id\", DataType::Utf8, true),\n    Field::new(\"value\", DataType::Int32, true),\n]);\n\nlet data = RecordBatch::try_new(\n    schema.into(),\n    vec![\n        Arc::new(arrow::array::StringArray::from(vec![\"1\", \"1\"])),\n        Arc::new(arrow::array::Int32Array::from(vec![11, 12])),\n    ],\n)\n.unwrap();\n\nlet table = deltalake::open_table(\"/tmp/my_table\").await.unwrap();\nlet _table = DeltaOps(table)\n    .write(vec![data])\n    .with_save_mode(SaveMode::Overwrite)\n    .with_replace_where(col(\"id\").eq(lit(\"1\")))\n    .await\n    .unwrap();\n

    "},{"location":"usage/writing/writing-to-s3-with-locking-provider/","title":"Writing to S3 with a locking provider","text":"

    Delta lake guarantees ACID transactions when writing data. This is done by default when writing to all supported object stores except AWS S3. (Some S3 clients like CloudFlare R2 or MinIO may enable concurrent writing without a locking provider, refer to this section for more information).

    When writing to S3, delta-rs provides a locking mechanism to ensure that concurrent writes are safe. This is done by default when writing to S3, but you can opt-out by setting the AWS_S3_ALLOW_UNSAFE_RENAME variable to true.

    To enable safe concurrent writes to AWS S3, we must provide an external locking mechanism.

    "},{"location":"usage/writing/writing-to-s3-with-locking-provider/#dynamodb","title":"DynamoDB","text":"

    DynamoDB is the only locking provider currently available in delta-rs. To enable DynamoDB as the locking provider, set AWS_S3_LOCKING_PROVIDER to 'dynamodb', either in storage_options or as an environment variable.

    Additionally, you must create a DynamoDB table with the name delta_log so that it can be automatically recognized by delta-rs. Alternatively, you can use a table name of your choice, but you must set the DELTA_DYNAMO_TABLE_NAME variable to match your chosen table name. The required schema for the DynamoDB table is as follows:

    \"Table\": {\n    \"AttributeDefinitions\": [\n        {\n            \"AttributeName\": \"fileName\",\n            \"AttributeType\": \"S\"\n        },\n        {\n            \"AttributeName\": \"tablePath\",\n            \"AttributeType\": \"S\"\n        }\n    ],\n    \"TableName\": \"delta_log\",\n    \"KeySchema\": [\n        {\n            \"AttributeName\": \"tablePath\",\n            \"KeyType\": \"HASH\"\n        },\n        {\n            \"AttributeName\": \"fileName\",\n            \"KeyType\": \"RANGE\"\n        }\n    ],\n}\n

    Here is an example writing to s3 using this mechanism:

    from deltalake import write_deltalake\ndf = pd.DataFrame({'x': [1, 2, 3]})\nstorage_options = {\n    'AWS_S3_LOCKING_PROVIDER': 'dynamodb',\n    'DELTA_DYNAMO_TABLE_NAME': 'custom_table_name'\n}\nwrite_deltalake(\n    's3a://path/to/table',\n    df,\n    storage_options=storage_options\n)\n

    This locking mechanism is compatible with the one used by Apache Spark. The tablePath property, denoting the root url of the delta table itself, is part of the primary key, and all writers intending to write to the same table must match this property precisely. In Spark, S3 URLs are prefixed with s3a://, and a table in delta-rs must be configured accordingly.

    Note that delta-rs does not read credentials from your local .aws/config or .aws/creds file. Credentials can be accessed from environment variables, ec2 metadata, profiles or web identity. You can pass credentials to storage_options using AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY.

    The following code allows creating the necessary DynamoDB table from the AWS cli:

    aws dynamodb create-table \\\n--table-name delta_log \\\n--attribute-definitions AttributeName=tablePath,AttributeType=S AttributeName=fileName,AttributeType=S \\\n--key-schema AttributeName=tablePath,KeyType=HASH AttributeName=fileName,KeyType=RANGE \\\n--provisioned-throughput ReadCapacityUnits=5,WriteCapacityUnits=5\n

    You can find additional information in the Delta Lake documentation, which also includes recommendations on configuring a time-to-live (TTL) for the table to avoid growing the table indefinitely.

    "},{"location":"usage/writing/writing-to-s3-with-locking-provider/#enable-unsafe-writes-in-s3-opt-in","title":"Enable unsafe writes in S3 (opt-in)","text":"

    If for some reason you don't want to use DynamoDB as your locking mechanism, you can set the AWS_S3_ALLOW_UNSAFE_RENAME variable to true in order to enable unsafe S3 writes.
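    A minimal sketch of the opt-in, passed through storage_options (it can also be set as an environment variable):

    import pandas as pd
    from deltalake import write_deltalake

    df = pd.DataFrame({'x': [1, 2, 3]})
    storage_options = {
        'AWS_S3_ALLOW_UNSAFE_RENAME': 'true',
    }
    write_deltalake(
        's3a://path/to/table',
        df,
        storage_options=storage_options,
    )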

    "},{"location":"usage/writing/writing-to-s3-with-locking-provider/#required-permissions","title":"Required permissions","text":"

    You need to have permissions to get, put and delete objects in the S3 bucket you're storing your data in. Please note that you must be allowed to delete objects even if you're just appending to the Delta table, because temporary files are written into the log folder and deleted after use.

    In AWS, those would be the required permissions:

    • s3:GetObject
    • s3:PutObject
    • s3:DeleteObject

    In DynamoDB, you need those permissions:

    • dynamodb:GetItem
    • dynamodb:Query
    • dynamodb:PutItem
    • dynamodb:UpdateItem
    "},{"location":"usage/writing/writing-to-s3-with-locking-provider/#enabling-concurrent-writes-for-alternative-clients","title":"Enabling concurrent writes for alternative clients","text":"

    Unlike AWS S3, some S3 clients support atomic renames by passing some headers in requests.

    For CloudFlare R2 passing this in the storage_options will enable concurrent writes:

    storage_options = {\n    \"copy_if_not_exists\": \"header: cf-copy-destination-if-none-match: *\",\n}\n

    Something similar can be done with MinIO but the header to pass should be verified in the MinIO documentation.

    "}]} \ No newline at end of file diff --git a/sitemap.xml b/sitemap.xml index dd42031de1..649a9576a0 100644 --- a/sitemap.xml +++ b/sitemap.xml @@ -2,197 +2,202 @@ https://github.com/delta-io/delta-rs/ - 2024-05-11 + 2024-07-03 daily https://github.com/delta-io/delta-rs/delta-lake-best-practices/ - 2024-05-11 + 2024-07-03 daily https://github.com/delta-io/delta-rs/delta-lake-big-data-small-data/ - 2024-05-11 + 2024-07-03 daily https://github.com/delta-io/delta-rs/why-use-delta-lake/ - 2024-05-11 + 2024-07-03 daily https://github.com/delta-io/delta-rs/api/catalog/ - 2024-05-11 + 2024-07-03 daily https://github.com/delta-io/delta-rs/api/delta_writer/ - 2024-05-11 + 2024-07-03 daily https://github.com/delta-io/delta-rs/api/exceptions/ - 2024-05-11 + 2024-07-03 daily https://github.com/delta-io/delta-rs/api/schema/ - 2024-05-11 + 2024-07-03 daily https://github.com/delta-io/delta-rs/api/storage/ - 2024-05-11 + 2024-07-03 daily https://github.com/delta-io/delta-rs/api/delta_table/ - 2024-05-11 + 2024-07-03 daily https://github.com/delta-io/delta-rs/api/delta_table/delta_table_alterer/ - 2024-05-11 + 2024-07-03 daily https://github.com/delta-io/delta-rs/api/delta_table/delta_table_merger/ - 2024-05-11 + 2024-07-03 daily https://github.com/delta-io/delta-rs/api/delta_table/delta_table_optimizer/ - 2024-05-11 + 2024-07-03 daily https://github.com/delta-io/delta-rs/api/delta_table/metadata/ - 2024-05-11 + 2024-07-03 daily https://github.com/delta-io/delta-rs/how-delta-lake-works/architecture-of-delta-table/ - 2024-05-11 + 2024-07-03 daily https://github.com/delta-io/delta-rs/how-delta-lake-works/delta-lake-acid-transactions/ - 2024-05-11 + 2024-07-03 daily https://github.com/delta-io/delta-rs/how-delta-lake-works/delta-lake-file-skipping/ - 2024-05-11 + 2024-07-03 daily https://github.com/delta-io/delta-rs/integrations/delta-lake-arrow/ - 2024-05-11 + 2024-07-03 daily https://github.com/delta-io/delta-rs/integrations/delta-lake-daft/ - 2024-05-11 + 2024-07-03 daily https://github.com/delta-io/delta-rs/integrations/delta-lake-dagster/ - 2024-05-11 + 2024-07-03 daily https://github.com/delta-io/delta-rs/integrations/delta-lake-dask/ - 2024-05-11 + 2024-07-03 daily https://github.com/delta-io/delta-rs/integrations/delta-lake-datafusion/ - 2024-05-11 + 2024-07-03 daily https://github.com/delta-io/delta-rs/integrations/delta-lake-pandas/ - 2024-05-11 + 2024-07-03 daily https://github.com/delta-io/delta-rs/integrations/delta-lake-polars/ - 2024-05-11 + 2024-07-03 + daily + + + https://github.com/delta-io/delta-rs/integrations/object-storage/hdfs/ + 2024-07-03 daily https://github.com/delta-io/delta-rs/usage/appending-overwriting-delta-lake-table/ - 2024-05-11 + 2024-07-03 daily https://github.com/delta-io/delta-rs/usage/constraints/ - 2024-05-11 + 2024-07-03 daily https://github.com/delta-io/delta-rs/usage/create-delta-lake-table/ - 2024-05-11 + 2024-07-03 daily https://github.com/delta-io/delta-rs/usage/deleting-rows-from-delta-lake-table/ - 2024-05-11 + 2024-07-03 daily https://github.com/delta-io/delta-rs/usage/examining-table/ - 2024-05-11 + 2024-07-03 daily https://github.com/delta-io/delta-rs/usage/installation/ - 2024-05-11 + 2024-07-03 daily https://github.com/delta-io/delta-rs/usage/loading-table/ - 2024-05-11 + 2024-07-03 daily https://github.com/delta-io/delta-rs/usage/managing-tables/ - 2024-05-11 + 2024-07-03 daily https://github.com/delta-io/delta-rs/usage/overview/ - 2024-05-11 + 2024-07-03 daily https://github.com/delta-io/delta-rs/usage/querying-delta-tables/ - 2024-05-11 + 
2024-07-03 daily https://github.com/delta-io/delta-rs/usage/read-cdf/ - 2024-05-11 + 2024-07-03 daily https://github.com/delta-io/delta-rs/usage/optimize/delta-lake-z-order/ - 2024-05-11 + 2024-07-03 daily https://github.com/delta-io/delta-rs/usage/optimize/small-file-compaction-with-optimize/ - 2024-05-11 + 2024-07-03 daily https://github.com/delta-io/delta-rs/usage/writing/ - 2024-05-11 + 2024-07-03 daily https://github.com/delta-io/delta-rs/usage/writing/writing-to-s3-with-locking-provider/ - 2024-05-11 + 2024-07-03 daily \ No newline at end of file diff --git a/sitemap.xml.gz b/sitemap.xml.gz index b117b35ed03cb7c9644fa4e10552a06a5cfc121c..9d193393e3afae498eb3bc297b77049a845a2f7d 100644 GIT binary patch literal 619 zcmV-x0+jt9iwFoW_=IKx|8r?{Wo=<_E_iKh0M(hzZrd;n$M1cLz<0%2J76D3+}oaD zI~R;h+gfBxrQ|s2(~q*<)EHI_*kLezi6k(>?@vi2W$zy^t=*iFf^pp;`>@?*8|<_z zO;;bXAK$;J&)H*fx9`aYj2kxY!Xay7^k4IQ7=~>y5nJ$K>!{9yZnzWjc`)B?zhuST zhMzRS)Eb|?b-GBi1fHfgvFVRn?b^IV8$lU2Hz?%$Jee;42_d$wEbe#rAJy)Y+C8wz zJhm{?4RkeDgwI6@#$NaNVq`+fZ%qG8H#IDrP#rNusYNg;Wu)KRf+RPh~8sa5CQW&0ytPOc${xnB2Zx6CyVVWiO$MV$tic{#x>V z`fw799-->f&0eSg4;{N-D!j8mqTt=}C#PcdTKhL;wRZeIBv9}kyY$9NxieC_xu*7F zDI^pWlN8PCz*Pe;iwxImLWn5S*WAT%r*KseR7I|psRJ+2nl1^E!)$n)gSBJe5dmx+ zl(JmhLI?E?Sw=WX@%wCW5CQw#BVFGzjUYF3&@PoOD;axYelN4EtzML)a`jh=-}t8w zEaOF*?fvv#Kn*;Dy4gq&G#6N+SYM06=u~)^e9kkS(vT;2HveTajXwcn@e7L4x>a@^ F002l>DZc;! literal 606 zcmV-k0-^mMiwFo#VLxU9|8r?{Wo=<_E_iKh0M(hnj+-zLhVT0n5qF1V+e&Sfu(v)z zdtOb<09Fk4j!j7R={p9RMO9aodZ-vKv8{>q{ewL-HixHcYd069U|e_1_S;>y!A`r< zboDX&@%@|noIMo}hn{S}xMAZi9J3}y|25BtVb}%}u>~Ktj_N$B?t?@Znr;9X8;Av_ToBp)buFXrd5tMOrgF=4Dlj-uG5Mt}f;&J!*QSCmd{hm$c zv4xp#psTSWd?`vW_IAh@BNI}7XZmNlsbS%a>VzRGPoSgGD2dzDss!$Z7OWkcD~BJN zo9Y9qHxVlb25)lBezmR^Kst7w2jV?U0&wxF(MOJ6K>+L&ASHlCM^K}f6C;+39b3Dq~TFv+ro6b>J0R(X@27*c8h8eE sw~-)dF0e$g{wfNiQ{ltpbDrs(hCI28`7euU{0SI~pS{s|qqiIY0P>tEc>n+a diff --git a/usage/appending-overwriting-delta-lake-table/index.html b/usage/appending-overwriting-delta-lake-table/index.html index 9d4de76929..e6a6a4971e 100644 --- a/usage/appending-overwriting-delta-lake-table/index.html +++ b/usage/appending-overwriting-delta-lake-table/index.html @@ -287,8 +287,14 @@ + + + + + +
  • - + @@ -300,6 +306,9 @@ + + + @@ -1358,6 +1367,8 @@ + + @@ -1401,6 +1412,76 @@ + + + + + + + + + +
  • + + + + + + + + + + + + + +
  • + + + + + + + + +
  • diff --git a/usage/constraints/index.html b/usage/constraints/index.html index 04bd265852..7cb4445d09 100644 --- a/usage/constraints/index.html +++ b/usage/constraints/index.html @@ -287,8 +287,14 @@ + + + + + +
  • - + @@ -300,6 +306,9 @@ + + + @@ -1358,6 +1367,8 @@ + + @@ -1401,6 +1412,76 @@ + + + + + + + + + +
  • + + + + + + + + + + + + + +
  • + + + + + + + + +
  • diff --git a/usage/create-delta-lake-table/index.html b/usage/create-delta-lake-table/index.html index 723663f928..7fb2536766 100644 --- a/usage/create-delta-lake-table/index.html +++ b/usage/create-delta-lake-table/index.html @@ -287,8 +287,14 @@ + + + + + +
  • - + @@ -300,6 +306,9 @@ + + + @@ -1314,6 +1323,8 @@ + + @@ -1357,6 +1368,76 @@ + + + + + + + + + +
  • + + + + + + + + + + + + + +
  • + + + + + + + + +
  • diff --git a/usage/deleting-rows-from-delta-lake-table/index.html b/usage/deleting-rows-from-delta-lake-table/index.html index 72a9035bea..93aab3d611 100644 --- a/usage/deleting-rows-from-delta-lake-table/index.html +++ b/usage/deleting-rows-from-delta-lake-table/index.html @@ -287,8 +287,14 @@ + + + + + +
  • - + @@ -300,6 +306,9 @@ + + + @@ -1314,6 +1323,8 @@ + + @@ -1357,6 +1368,76 @@ + + + + + + + + + +
  • + + + + + + + + + + + + + +
  • + + + + + + + + +
  • @@ -1675,6 +1756,8 @@

    Deleting rows from a Delta Lake t | 2 | b | +-------+----------+

  • +

    dt.delete() accepts any SQL where clause. If no predicate is provided, all rows will be deleted.

    +

    Read more in the API docs

    diff --git a/usage/examining-table/index.html b/usage/examining-table/index.html index a76c5d7d39..1380464dbd 100644 --- a/usage/examining-table/index.html +++ b/usage/examining-table/index.html @@ -287,8 +287,14 @@ + + + + + +
  • - + @@ -300,6 +306,9 @@ + + + @@ -1372,6 +1381,8 @@ + + @@ -1415,6 +1426,76 @@ + + + + + + + + + +
  • + + + + + + + + + + + + + +
  • + + + + + + + + +
  • diff --git a/usage/installation/index.html b/usage/installation/index.html index 60fa26a2c0..802c02d0f2 100644 --- a/usage/installation/index.html +++ b/usage/installation/index.html @@ -287,8 +287,14 @@ + + + + + +
  • - + @@ -300,6 +306,9 @@ + + + @@ -1365,6 +1374,8 @@ + + @@ -1408,6 +1419,76 @@ + + + + + + + + + +
  • + + + + + + + + + + + + + +
  • + + + + + + + + +
  • @@ -1757,9 +1838,7 @@

    Run Delta Lake and pand - pandas - polars - jupyterlab - - pip - - pip: - - deltalake + - deltalake

  • Create a virtual environment with the dependencies:

    conda env create -f deltalake-minimal.yml
    diff --git a/usage/loading-table/index.html b/usage/loading-table/index.html
    index bf6bae5a70..41d98f2057 100644
    --- a/usage/loading-table/index.html
    +++ b/usage/loading-table/index.html
    @@ -287,8 +287,14 @@
       
         
         
    +      
    +  
    +  
    +  
    +    
    +    
           
  • - + @@ -300,6 +306,9 @@ + + + @@ -1358,6 +1367,8 @@ + + @@ -1401,6 +1412,76 @@ + + + + + + + + + +
  • + + + + + + + + + + + + + +
  • + + + + + + + + +
  • @@ -1764,6 +1845,7 @@

    Loading a Delta Table

  • s3a://\<bucket>/\<path>
  • +

    Note that delta-rs does not read credentials from a local .aws/config or .aws/creds file. Credentials can be accessed from environment variables, ec2 metadata, profiles or web identity. You can also pass credentials to storage_options using AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY.

    Azure:

    @@ -1365,6 +1381,8 @@ + + @@ -1408,6 +1426,76 @@ + + + + + + + + + +
  • + + + + + + + + + + + + + +
  • + + + + + + + + +
  • @@ -1711,6 +1799,13 @@ Required permissions +
  • + +
  • + + Enabling concurrent writes for alternative clients + +
  • @@ -1732,8 +1827,9 @@

    Writing to S3 with a locking provider

    -

    A locking mechanism is needed to prevent unsafe concurrent writes to a -delta lake directory when writing to S3.

    +

    Delta lake guarantees ACID transactions when writing data. This is done by default when writing to all supported object stores except AWS S3. (Some S3 clients like CloudFlare R2 or MinIO may enable concurrent writing without a locking provider, refer to this section for more information).

    +

    When writing to S3, delta-rs provides a locking mechanism to ensure that concurrent writes are safe. This is done by default when writing to S3, but you can opt-out by setting the AWS_S3_ALLOW_UNSAFE_RENAME variable to true.

    +

    To enable safe concurrent writes to AWS S3, we must provide an external locking mechanism.

    DynamoDB

    DynamoDB is the only available locking provider at the moment in delta-rs. To enable DynamoDB as the locking provider, you need to set the AWS_S3_LOCKING_PROVIDER to 'dynamodb' as a storage_options or as an environment variable.

    Additionally, you must create a DynamoDB table with the name delta_log @@ -1768,32 +1864,54 @@

    DynamoDB

    Here is an example writing to s3 using this mechanism:

    from deltalake import write_deltalake
     df = pd.DataFrame({'x': [1, 2, 3]})
    -storage_options = {'AWS_S3_LOCKING_PROVIDER': 'dynamodb', 'DELTA_DYNAMO_TABLE_NAME': 'custom_table_name'}
    -write_deltalake('s3a://path/to/table', df, 'storage_options'= storage_options)
    +storage_options = {
    +    'AWS_S3_LOCKING_PROVIDER': 'dynamodb',
    +    'DELTA_DYNAMO_TABLE_NAME': 'custom_table_name'
    +}
    +write_deltalake(
    +    's3a://path/to/table',
    +    df,
    +    storage_options=storage_options
    +)
     

    This locking mechanism is compatible with the one used by Apache Spark. The tablePath property, denoting the root url of the delta table itself, is part of the primary key, and all writers intending to write to the same table must match this property precisely. In Spark, S3 URLs are prefixed with s3a://, and a table in delta-rs must be configured accordingly.

    -

    The following code allows creating the necessary table from the AWS cli:

    +

    Note that delta-rs does not read credentials from your local .aws/config or .aws/creds file. Credentials can be accessed from environment variables, ec2 metadata, profiles or web identity. You can pass credentials to storage_options using AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY.

    +

    The following code allows creating the necessary DynamoDB table from the AWS cli:

    aws dynamodb create-table \
     --table-name delta_log \
     --attribute-definitions AttributeName=tablePath,AttributeType=S AttributeName=fileName,AttributeType=S \
     --key-schema AttributeName=tablePath,KeyType=HASH AttributeName=fileName,KeyType=RANGE \
     --provisioned-throughput ReadCapacityUnits=5,WriteCapacityUnits=5
     
    -

    You can find additional information in the delta-rs-documentation, which also includes recommendations on configuring a time-to-live (TTL) for the table to avoid growing the table indefinitely.

    +

    You can find additional information in the Delta Lake documentation, which also includes recommendations on configuring a time-to-live (TTL) for the table to avoid growing the table indefinitely.

    Enable unsafe writes in S3 (opt-in)

    If for some reason you don't want to use dynamodb as your locking mechanism you can choose to set the AWS_S3_ALLOW_UNSAFE_RENAME variable to true in order to enable S3 unsafe writes.

    Required permissions

    You need to have permissions to get, put and delete objects in the S3 bucket you're storing your data in. Please note that you must be allowed to delete objects even if you're just appending to the deltalake, because there are temporary files into the log folder that are deleted after usage.

    -

    In AWS, those would be the required permissions: -- s3:GetObject -- s3:PutObject -- s3:DeleteObject

    -

    In DynamoDB, you need those permissions: -- dynamodb:GetItem -- dynamodb:Query -- dynamodb:PutItem -- dynamodb:UpdateItem

    +

    In AWS, those would be the required permissions:

    +
      +
    • s3:GetObject
    • +
    • s3:PutObject
    • +
    • s3:DeleteObject
    • +
    +

    In DynamoDB, you need those permissions:

    +
      +
    • dynamodb:GetItem
    • +
    • dynamodb:Query
    • +
    • dynamodb:PutItem
    • +
    • dynamodb:UpdateItem
    • +
    +

    Enabling concurrent writes for alternative clients

    +

    Unlike AWS S3, some S3 clients support atomic renames by passing some headers +in requests.

    +

    For CloudFlare R2 passing this in the storage_options will enable concurrent writes:

    +
    storage_options = {
    +    "copy_if_not_exists": "header: cf-copy-destination-if-none-match: *",
    +}
    +
    +

    Something similar can be done with MinIO but the header to pass should be verified +in the MinIO documentation.

    diff --git a/why-use-delta-lake/index.html b/why-use-delta-lake/index.html index b64b81a2d8..a4dd7cde4f 100644 --- a/why-use-delta-lake/index.html +++ b/why-use-delta-lake/index.html @@ -287,8 +287,14 @@ + + + + + +
  • - + @@ -300,6 +306,9 @@ + + + @@ -1393,6 +1402,8 @@ + + @@ -1436,6 +1447,76 @@ + + + + + + + + + +
  • + + + + + + + + + + + + + +
  • + + + + + + + + +
  • @@ -1802,7 +1883,7 @@

    Fast performance

    For data lakes, you need to run file listing operations to get the file paths before you can actually read the data. Listing all the files in a data lake can take a long time, especially if there are a lot of files and they are stored in Hive-style partitions.

    Delta Lake stores all the file paths in the transaction log. So you can quickly get the file paths directly from the log and then run your query. Delta Lake also stores the file-level metadata in the transaction log which is quicker than opening all the files in the data lake and grabbing the metadata from the file footer.

    Developer friendly features

    -

    Many basic data operations are hard in data lakes but quite easy with Delta Lake. The only data operation that’s easy with in data lake is appending data. Delta Lake makes all data operations easy including the following:

    +

    Many basic data operations are hard in data lakes but quite easy with Delta Lake. The only data operation that’s easy with a data lake is appending data. Delta Lake makes all data operations easy including the following:

    • Appends
    • Upserts