Support Spatial Types for PostGIS #1927
First changed file: the postgres destination client module (its path is not preserved in this view).

@@ -2,30 +2,28 @@
 from dlt.common import logger
 from dlt.common.data_writers.configuration import CsvFormatConfiguration
+from dlt.common.destination import DestinationCapabilitiesContext
 from dlt.common.destination.exceptions import (
     DestinationInvalidFileFormat,
     DestinationTerminalException,
 )
 from dlt.common.destination.reference import (
     HasFollowupJobs,
     PreparedTableSchema,
     RunnableLoadJob,
     FollowupJobRequest,
     LoadJob,
     TLoadJobState,
 )
-from dlt.common.destination import DestinationCapabilitiesContext
+from dlt.common.exceptions import TerminalValueError
 from dlt.common.schema import TColumnSchema, TColumnHint, Schema
-from dlt.common.schema.typing import TColumnType, TTableFormat
+from dlt.common.schema.typing import TColumnType
 from dlt.common.schema.utils import is_nullable_column
 from dlt.common.storages.file_storage import FileStorage

-from dlt.destinations.sql_jobs import SqlStagingCopyFollowupJob, SqlJobParams
-from dlt.destinations.insert_job_client import InsertValuesJobClient
-from dlt.destinations.impl.postgres.sql_client import Psycopg2SqlClient
 from dlt.destinations.impl.postgres.configuration import PostgresClientConfiguration
+from dlt.destinations.impl.postgres.postgres_adapter import GEOMETRY_HINT
+from dlt.destinations.impl.postgres.sql_client import Psycopg2SqlClient
+from dlt.destinations.insert_job_client import InsertValuesJobClient
 from dlt.destinations.sql_client import SqlClientBase
+from dlt.destinations.sql_jobs import SqlStagingCopyFollowupJob, SqlJobParams

 HINT_TO_POSTGRES_ATTR: Dict[TColumnHint, str] = {"unique": "UNIQUE"}
@@ -43,15 +41,16 @@ def generate_sql(
         with sql_client.with_staging_dataset():
             staging_table_name = sql_client.make_qualified_table_name(table["name"])
         table_name = sql_client.make_qualified_table_name(table["name"])
-        # drop destination table
-        sql.append(f"DROP TABLE IF EXISTS {table_name};")
-        # moving staging table to destination schema
-        sql.append(
-            f"ALTER TABLE {staging_table_name} SET SCHEMA"
-            f" {sql_client.fully_qualified_dataset_name()};"
-        )
-        # recreate staging table
-        sql.append(f"CREATE TABLE {staging_table_name} (like {table_name} including all);")
+        sql.extend(
+            (
+                f"DROP TABLE IF EXISTS {table_name};",
+                (
+                    f"ALTER TABLE {staging_table_name} SET SCHEMA"
+                    f" {sql_client.fully_qualified_dataset_name()};"
+                ),
+                f"CREATE TABLE {staging_table_name} (like {table_name} including all);",
+            )
+        )
         return sql
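The refactor only collects the same three statements into one sql.extend call; behavior is unchanged. For a concrete sense of what the followup job renders, here is a minimal sketch, assuming illustrative names (my_dataset, my_dataset_staging, my_table); none of these come from the diff:

# Sketch of the statements the staging-copy followup job produces, with
# hypothetical qualified names standing in for what sql_client would return.
staging_table_name = '"my_dataset_staging"."my_table"'
table_name = '"my_dataset"."my_table"'
dataset_name = '"my_dataset"'

sql = []
sql.extend(
    (
        # drop the destination table
        f"DROP TABLE IF EXISTS {table_name};",
        # move the staging table into the destination schema
        f"ALTER TABLE {staging_table_name} SET SCHEMA {dataset_name};",
        # recreate an empty staging table with the destination table's structure
        f"CREATE TABLE {staging_table_name} (like {table_name} including all);",
    )
)
print("\n".join(sql))
# DROP TABLE IF EXISTS "my_dataset"."my_table";
# ALTER TABLE "my_dataset_staging"."my_table" SET SCHEMA "my_dataset";
# CREATE TABLE "my_dataset_staging"."my_table" (like "my_dataset"."my_table" including all);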
@@ -111,8 +110,7 @@ def run(self) -> None:
             split_columns.append(norm_col)
             if norm_col in split_headers and is_nullable_column(col):
                 split_null_headers.append(norm_col)
-        split_unknown_headers = set(split_headers).difference(split_columns)
-        if split_unknown_headers:
+        if split_unknown_headers := set(split_headers).difference(split_columns):
             raise DestinationInvalidFileFormat(
                 "postgres",
                 "csv",
@@ -130,15 +128,8 @@ def run(self) -> None:

             qualified_table_name = sql_client.make_qualified_table_name(table_name)
             copy_sql = (
-                "COPY %s (%s) FROM STDIN WITH (FORMAT CSV, DELIMITER '%s', NULL '',"
-                " %s ENCODING '%s')"
-                % (
-                    qualified_table_name,
-                    headers,
-                    sep,
-                    null_headers,
-                    csv_format.encoding,
-                )
+                f"COPY {qualified_table_name} ({headers}) FROM STDIN WITH (FORMAT CSV, DELIMITER"
+                f" '{sep}', NULL '', {null_headers} ENCODING '{csv_format.encoding}')"
             )
             with sql_client.begin_transaction():
                 with sql_client.native_connection.cursor() as cursor:
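Below is a standalone sketch of how such a COPY statement is typically streamed with psycopg2, the driver behind Psycopg2SqlClient. The DSN, table, column list, FORCE_NULL fragment, and file path are illustrative assumptions, not values from the diff:

import psycopg2

# Hypothetical rendered COPY statement; in the job, qualified_table_name,
# headers, sep, null_headers, and the encoding come from the schema and the
# CSV format configuration.
copy_sql = (
    'COPY "my_dataset"."towns" ("town", "loc") FROM STDIN WITH (FORMAT CSV,'
    " DELIMITER ',', NULL '', FORCE_NULL(\"loc\"), ENCODING 'utf-8')"
)

conn = psycopg2.connect("dbname=dlt_data user=loader")  # illustrative DSN
try:
    with conn:  # psycopg2 commits on success, rolls back on error
        with conn.cursor() as cursor, open("load.csv", "rb") as f:
            # copy_expert streams the file object through COPY ... FROM STDIN
            cursor.copy_expert(copy_sql, f)
finally:
    conn.close()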
@@ -167,21 +158,30 @@ def __init__(
     def create_load_job(
         self, table: PreparedTableSchema, file_path: str, load_id: str, restore: bool = False
     ) -> LoadJob:
+        if any(
+            column.get(GEOMETRY_HINT) for column in table["columns"].values()
+        ) and not file_path.endswith("insert_values"):
+            # Only insert_values load jobs supported for geom types.
+            # TODO: This isn't actually true, can make it work with geoarrow!
+            raise TerminalValueError(
+                "CSV bulk loading is not supported for tables with geometry columns."
+            )
         job = super().create_load_job(table, file_path, load_id, restore)
         if not job and file_path.endswith("csv"):
             job = PostgresCsvCopyJob(file_path)
         return job

     def _get_column_def_sql(self, c: TColumnSchema, table: PreparedTableSchema = None) -> str:
-        hints_str = " ".join(
+        hints_ = " ".join(
             self.active_hints.get(h, "")
             for h in self.active_hints.keys()
             if c.get(h, False) is True
         )
         column_name = self.sql_client.escape_column_name(c["name"])
-        return (
-            f"{column_name} {self.type_mapper.to_destination_type(c,table)} {hints_str} {self._gen_not_null(c.get('nullable', True))}"
-        )
+        nullability = self._gen_not_null(c.get("nullable", True))
+        column_type = self.type_mapper.to_destination_type(c, table)
+
+        return f"{column_name} {column_type} {hints_} {nullability}"

     def _create_replace_followup_jobs(
         self, table_chain: Sequence[PreparedTableSchema]

Review thread on the TerminalValueError raised above:

Reviewer: Is it really true? If the column is text and contains WKT data, it should get inserted. Did you try it?

Author: I added CSV loader cases to the geo test, and it all passes. Your intuition was entirely correct!
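To make the refactored _get_column_def_sql concrete, here is a small sketch of how the four fragments compose into a column definition. The escaped name, PostGIS type string, hint fragment, and nullability fragment are hypothetical stand-ins for what sql_client, type_mapper, and _gen_not_null would return:

# Hypothetical fragments standing in for sql_client.escape_column_name,
# type_mapper.to_destination_type, the joined active hints, and _gen_not_null.
column_name = '"loc"'
column_type = "geometry(Geometry, 4326)"  # assumed mapping for a geometry-hinted column
hints_ = ""  # no active hints (e.g. UNIQUE) set on this column
nullability = "NOT NULL"

# Same composition as the refactored return statement.
column_def = f"{column_name} {column_type} {hints_} {nullability}"
print(column_def)  # "loc" geometry(Geometry, 4326)  NOT NULL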
Second changed file: dlt/destinations/impl/postgres/postgres_adapter.py, new in this PR (the module imported above for GEOMETRY_HINT).

@@ -0,0 +1,63 @@ (entire file added)

from typing import Any, Optional

from dlt.common.schema.typing import TColumnNames, TTableSchemaColumns
from dlt.destinations.utils import get_resource_for_adapter
from dlt.extract import DltResource

GEOMETRY_HINT = "x-postgres-geometry"
SRID_HINT = "x-postgres-srid"


def postgres_adapter(
    data: Any,
    geometry: TColumnNames = None,
    srid: Optional[int] = 4326,
) -> DltResource:
    """Prepares data for the postgres destination by specifying which columns should
    be cast to PostGIS geometry types.

    Args:
        data (Any): The data to be transformed. It can be raw data or an instance
            of DltResource. If raw data, the function wraps it into a DltResource
            object.
        geometry (TColumnNames, optional): Specify columns to cast to geometries.
            It can be a single column name as a string, or a list of column names.
        srid (int, optional): The Spatial Reference System Identifier (SRID) to be
            used for the geometry columns. If not provided, SRID 4326 will be used.

    Returns:
        DltResource: A resource with applied postgres-specific hints.

    Raises:
        ValueError: If input for `geometry` is invalid, or if no geometry columns are specified.

    Examples:
        >>> data = [{"town": "Null Island", "loc": "POINT(0 0)"}]
        >>> postgres_adapter(data, geometry="loc", srid=4326)
        [DltResource with hints applied]
    """
    resource = get_resource_for_adapter(data)

    column_hints: TTableSchemaColumns = {}

    if geometry:
        if isinstance(geometry, str):
            geometry = [geometry]
        if not isinstance(geometry, list):
            raise ValueError(
                "'geometry' must be a list of column names or a single column name as a string."
            )

        for column_name in geometry:
            column_hints[column_name] = {
                "name": column_name,
                GEOMETRY_HINT: True,  # type: ignore[misc]
            }
            if srid is not None:
                column_hints[column_name][SRID_HINT] = srid  # type: ignore

    if not column_hints:
        raise ValueError("A value for 'geometry' must be specified.")
    else:
        resource.apply_hints(columns=column_hints)
    return resource
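A usage sketch of the new adapter end to end, based on the docstring's example; the pipeline, dataset, and table names are illustrative:

import dlt
from dlt.destinations.impl.postgres.postgres_adapter import postgres_adapter

# WKT values in the "loc" column will be hinted as PostGIS geometry, SRID 4326.
data = [
    {"town": "Null Island", "loc": "POINT(0 0)"},
    {"town": "Greenwich", "loc": "POINT(-0.0015 51.4778)"},
]

resource = postgres_adapter(data, geometry="loc", srid=4326)

pipeline = dlt.pipeline(
    pipeline_name="geo_demo",  # illustrative
    destination="postgres",
    dataset_name="spatial_data",  # illustrative
)
load_info = pipeline.run(resource, table_name="towns")
print(load_info)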
Review thread on WKB handling in the adapter:

Reviewer: This is very impractical :/ We have a dependence on shapely and there may be false positives (there are no magic numbers in WKB). Passing the expected type to escape...literal will slow this down. We could use a special binary type, but the user would need to wrap binary anyway. So my take is to drop wkb and just keep wkb_hex.

Author: Agreed, removed wkb support, will update docs too.
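To illustrate the trade-off being discussed: with shapely (the dependency the reviewer mentions), the same geometry serializes either as raw WKB bytes or as a hex-encoded string, and only the latter passes safely through text-based load paths such as SQL literals and CSV. A minimal sketch:

from shapely import wkt

geom = wkt.loads("POINT (0 0)")

# Raw WKB is bytes: no magic number marks it, so sniffing arbitrary binary
# values as geometries risks false positives, which is the reviewer's objection.
raw_wkb = geom.wkb

# wkb_hex is plain ASCII and survives text-based load paths unchanged.
hex_wkb = geom.wkb_hex
print(hex_wkb)  # 010100000000000000000000000000000000000000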