From a5989215fea6a498d9adab4f6a0877c94d413b87 Mon Sep 17 00:00:00 2001 From: Violetta Mishechkina Date: Thu, 19 Sep 2024 14:55:45 +0200 Subject: [PATCH 1/3] Add sftp option for filesystem source --- .../dlt-ecosystem/destinations/filesystem.md | 5 ++- .../verified-sources/filesystem/basic.md | 36 +++++++++++++++++-- .../verified-sources/filesystem/index.md | 1 + docs/website/docs/tutorial/filesystem.md | 2 +- docs/website/sidebars.js | 2 +- 5 files changed, 41 insertions(+), 5 deletions(-) diff --git a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md index a456fa6e7d..2be382c326 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md +++ b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md @@ -302,7 +302,10 @@ sftp_gss_deleg_creds # Delegate credentials with GSS-API, defaults to True sftp_gss_host # Host for GSS-API, defaults to None sftp_gss_trust_dns # Trust DNS for GSS-API, defaults to True ``` -> For more information about credentials parameters: https://docs.paramiko.org/en/3.3/api/client.html#paramiko.client.SSHClient.connect + +:::info +For more information about credentials parameters: https://docs.paramiko.org/en/3.3/api/client.html#paramiko.client.SSHClient.connect +::: ### Authentication methods diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/filesystem/basic.md b/docs/website/docs/dlt-ecosystem/verified-sources/filesystem/basic.md index 847ff64bf1..599944ae29 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/filesystem/basic.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/filesystem/basic.md @@ -6,7 +6,7 @@ keywords: [readers source and filesystem, files, filesystem, readers source, clo import Header from '../_source-info-header.md';
-Filesystem source allows loading files from remote locations (AWS S3, Google Cloud Storage, Google Drive, Azure) or the local filesystem seamlessly. Filesystem source natively supports `csv`, `parquet`, and `jsonl` files and allows customization for loading any type of structured files. +Filesystem source allows loading files from remote locations (AWS S3, Google Cloud Storage, Google Drive, Azure, sftp server) or the local filesystem seamlessly. Filesystem source natively supports `csv`, `parquet`, and `jsonl` files and allows customization for loading any type of structured files. To load unstructured data (`.pdf`, `.txt`, e-mail), please refer to the [unstructured data source](https://github.com/dlt-hub/verified-sources/tree/master/sources/unstructured_data). @@ -75,6 +75,7 @@ To get started with your data pipeline, follow these steps: {"label": "AWS S3", "value": "aws"}, {"label": "GCS/GDrive", "value": "gcp"}, {"label": "Azure", "value": "azure"}, + {"label": "SFTP", "value": "sftp"}, {"label": "Local filesystem", "value": "local"}, ]}> @@ -122,6 +123,18 @@ For more info, see + + +dlt supports several authentication methods: + +1. Key-based authentication +2. SSH Agent-based authentication +3. Username/Password authentication +4. GSS-API authentication + +Learn more about sftp authentication options in [SFTP section](../../destinations/filesystem#sftp). To obtain credentials, contact your server administrator. + + You don't need any credentials for the local filesystem. @@ -143,6 +156,7 @@ a bucket, can be specified in `config.toml`. {"label": "AWS S3", "value": "aws"}, {"label": "GCS/GDrive", "value": "gcp"}, {"label": "Azure", "value": "azure"}, + {"label": "SFTP", "value": "sftp"}, {"label": "Local filesystem", "value": "local"}, ]}> @@ -195,6 +209,24 @@ bucket_url="gs:////" ``` + + +Learn how to set up sftp credentials for each authentication method in the [SFTP section](../../destinations/filesystem#sftp). 
+For example, in case of key-based authentication, you can configure the source the following way: + +```toml +# secrets.toml +[sources.filesystem.credentials] +sftp_username = "foo" +sftp_key_filename = "/path/to/id_rsa" # Replace with the path to your private key file +sftp_key_passphrase = "your_passphrase" # Optional: passphrase for your private key + +# config.toml +[sources.filesystem] # use [sources.readers] for the "readers" source +bucket_url = "sftp://[hostname]/[path]" +``` + + You can use both native local filesystem paths and `file://` URI. Absolute, relative, and UNC Windows paths are supported. @@ -219,7 +251,7 @@ bucket_url='~\Documents\csv_files\' You can also specify the credentials using Environment variables. The name of the corresponding environment -variable should be slightly different than the corresponding name in the `toml` file. Simply replace dots `.` with double +variable should be slightly different from the corresponding name in the `toml` file. Simply replace dots `.` with double underscores `__`: ```sh diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/filesystem/index.md b/docs/website/docs/dlt-ecosystem/verified-sources/filesystem/index.md index 32e0df77c2..a7d308a64e 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/filesystem/index.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/filesystem/index.md @@ -9,6 +9,7 @@ The Filesystem source allows seamless loading of files from the following locati * Google Cloud Storage * Google Drive * Azure +* remote filesystem (via sftp) * local filesystem The Filesystem source natively supports `csv`, `parquet`, and `jsonl` files and allows customization for loading any type of structured files. 
diff --git a/docs/website/docs/tutorial/filesystem.md b/docs/website/docs/tutorial/filesystem.md index 6d30eed3e6..b169c7160c 100644 --- a/docs/website/docs/tutorial/filesystem.md +++ b/docs/website/docs/tutorial/filesystem.md @@ -4,7 +4,7 @@ description: Learn how to load data files like JSON, JSONL, CSV, and Parquet fro keywords: [dlt, tutorial, filesystem, cloud storage, file system, python, data pipeline, incremental loading, json, jsonl, csv, parquet, duckdb] --- -This tutorial is for you if you need to load data files like JSONL, CSV, and Parquet from either Cloud Storage (e.g., AWS S3, Google Cloud Storage, Google Drive, Azure Blob Storage) or a local file system. +This tutorial is for you if you need to load data files like JSONL, CSV, and Parquet from Cloud Storage (e.g., AWS S3, Google Cloud Storage, Google Drive, Azure Blob Storage), a remote SFTP server, or a local file system. ## What you will learn diff --git a/docs/website/sidebars.js b/docs/website/sidebars.js index 7e6000a2ca..bbcb458ebb 100644 --- a/docs/website/sidebars.js +++ b/docs/website/sidebars.js @@ -67,7 +67,7 @@ const sidebars = { { type: 'category', label: 'Filesystem & cloud storage', - description: 'AWS S3, Google Cloud Storage, Azure Blob Storage, local file system', + description: 'AWS S3, Google Cloud Storage, Azure, sftp, local file system', link: { type: 'doc', id: 'dlt-ecosystem/verified-sources/filesystem/index', From 8b410d2dadea096fb9187c9d53302682a3b38127 Mon Sep 17 00:00:00 2001 From: Violetta Mishechkina Date: Mon, 30 Sep 2024 18:34:42 +0200 Subject: [PATCH 2/3] Apply suggestions from code review Co-authored-by: Anton Burnashev --- .../docs/dlt-ecosystem/verified-sources/filesystem/basic.md | 6 +++--- .../docs/dlt-ecosystem/verified-sources/filesystem/index.md | 4 ++-- docs/website/sidebars.js | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/filesystem/basic.md 
b/docs/website/docs/dlt-ecosystem/verified-sources/filesystem/basic.md index 599944ae29..6eb02b4edf 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/filesystem/basic.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/filesystem/basic.md @@ -6,7 +6,7 @@ keywords: [readers source and filesystem, files, filesystem, readers source, clo import Header from '../_source-info-header.md';
-Filesystem source allows loading files from remote locations (AWS S3, Google Cloud Storage, Google Drive, Azure, sftp server) or the local filesystem seamlessly. Filesystem source natively supports `csv`, `parquet`, and `jsonl` files and allows customization for loading any type of structured files. +Filesystem source allows loading files from remote locations (AWS S3, Google Cloud Storage, Google Drive, Azure Blob Storage, SFTP server) or the local filesystem seamlessly. Filesystem source natively supports `csv`, `parquet`, and `jsonl` files and allows customization for loading any type of structured files. To load unstructured data (`.pdf`, `.txt`, e-mail), please refer to the [unstructured data source](https://github.com/dlt-hub/verified-sources/tree/master/sources/unstructured_data). @@ -132,7 +132,7 @@ dlt supports several authentication methods: 3. Username/Password authentication 4. GSS-API authentication -Learn more about sftp authentication options in [SFTP section](../../destinations/filesystem#sftp). To obtain credentials, contact your server administrator. +Learn more about SFTP authentication options in [SFTP section](../../destinations/filesystem#sftp). To obtain credentials, contact your server administrator. @@ -211,7 +211,7 @@ bucket_url="gs:////" -Learn how to set up sftp credentials for each authentication method in the [SFTP section](../../destinations/filesystem#sftp). +Learn how to set up SFTP credentials for each authentication method in the [SFTP section](../../destinations/filesystem#sftp). 
For example, in case of key-based authentication, you can configure the source the following way: ```toml diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/filesystem/index.md b/docs/website/docs/dlt-ecosystem/verified-sources/filesystem/index.md index a7d308a64e..bf2087d4b1 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/filesystem/index.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/filesystem/index.md @@ -8,8 +8,8 @@ The Filesystem source allows seamless loading of files from the following locati * AWS S3 * Google Cloud Storage * Google Drive -* Azure -* remote filesystem (via sftp) +* Azure Blob Storage +* remote filesystem (via SFTP) * local filesystem The Filesystem source natively supports `csv`, `parquet`, and `jsonl` files and allows customization for loading any type of structured files. diff --git a/docs/website/sidebars.js b/docs/website/sidebars.js index bbcb458ebb..32bb554842 100644 --- a/docs/website/sidebars.js +++ b/docs/website/sidebars.js @@ -67,7 +67,7 @@ const sidebars = { { type: 'category', label: 'Filesystem & cloud storage', - description: 'AWS S3, Google Cloud Storage, Azure, sftp, local file system', + description: 'AWS S3, Google Cloud Storage, Azure, SFTP, local file system', link: { type: 'doc', id: 'dlt-ecosystem/verified-sources/filesystem/index', From 4757831c9ce870d3a3129b8792a203341025a691 Mon Sep 17 00:00:00 2001 From: Violetta Mishechkina Date: Mon, 30 Sep 2024 18:38:20 +0200 Subject: [PATCH 3/3] Update the index page name --- .../docs/dlt-ecosystem/verified-sources/filesystem/index.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/filesystem/index.md b/docs/website/docs/dlt-ecosystem/verified-sources/filesystem/index.md index bf2087d4b1..1441931340 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/filesystem/index.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/filesystem/index.md @@ -1,6 +1,6 @@ 
--- -title: Filesystem & Buckets -description: dlt-verified source for Filesystem & Buckets +title: Filesystem & cloud storage +description: dlt-verified source for Filesystem & cloud storage keywords: [readers source and filesystem, files, filesystem, readers source, cloud storage] ---