From 2eb8cfe54eb4d6d3a595373ce2551f0ae6089d06 Mon Sep 17 00:00:00 2001
From: Violetta Mishechkina
Date: Tue, 1 Oct 2024 15:00:58 +0200
Subject: [PATCH] Docs: Add sftp option for filesystem source (#1845)

---
 .../dlt-ecosystem/destinations/filesystem.md  |  5 ++-
 .../verified-sources/filesystem/basic.md      | 36 +++++++++++++++++--
 .../verified-sources/filesystem/index.md      |  7 ++--
 docs/website/docs/tutorial/filesystem.md      |  2 +-
 docs/website/sidebars.js                      |  2 +-
 5 files changed, 44 insertions(+), 8 deletions(-)

diff --git a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md
index a456fa6e7d..2be382c326 100644
--- a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md
+++ b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md
@@ -302,7 +302,21 @@ sftp_gss_deleg_creds # Delegate credentials with GSS-API, defaults to True
 sftp_gss_host # Host for GSS-API, defaults to None
 sftp_gss_trust_dns # Trust DNS for GSS-API, defaults to True
 ```
-> For more information about credentials parameters: https://docs.paramiko.org/en/3.3/api/client.html#paramiko.client.SSHClient.connect
+
+:::info
+For more information about the credential parameters, see the Paramiko documentation: https://docs.paramiko.org/en/3.3/api/client.html#paramiko.client.SSHClient.connect
+:::
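+
+For example, password-based credentials could be configured as follows (a minimal sketch; the hostname, username, and password are placeholders, and the full set of options is described under the authentication methods below):
+
+```toml
+[destination.filesystem]
+bucket_url = "sftp://[hostname]/[path]"
+
+[destination.filesystem.credentials]
+sftp_username = "foo"
+sftp_password = "pass"
+```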
 
 ### Authentication methods
 
diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/filesystem/basic.md b/docs/website/docs/dlt-ecosystem/verified-sources/filesystem/basic.md
index 847ff64bf1..6eb02b4edf 100644
--- a/docs/website/docs/dlt-ecosystem/verified-sources/filesystem/basic.md
+++ b/docs/website/docs/dlt-ecosystem/verified-sources/filesystem/basic.md
@@ -6,7 +6,7 @@ keywords: [readers source and filesystem, files, filesystem, readers source, clo
 import Header from '../_source-info-header.md';
 
 <Header/>
-Filesystem source allows loading files from remote locations (AWS S3, Google Cloud Storage, Google Drive, Azure) or the local filesystem seamlessly. Filesystem source natively supports `csv`, `parquet`, and `jsonl` files and allows customization for loading any type of structured files.
+Filesystem source allows loading files from remote locations (AWS S3, Google Cloud Storage, Google Drive, Azure Blob Storage, SFTP server) or the local filesystem seamlessly. Filesystem source natively supports `csv`, `parquet`, and `jsonl` files and allows customization for loading any type of structured files.
 
 To load unstructured data (`.pdf`, `.txt`, e-mail), please refer to the [unstructured data source](https://github.com/dlt-hub/verified-sources/tree/master/sources/unstructured_data).
 
@@ -75,6 +75,7 @@ To get started with your data pipeline, follow these steps:
   {"label": "AWS S3", "value": "aws"},
   {"label": "GCS/GDrive", "value": "gcp"},
   {"label": "Azure", "value": "azure"},
+  {"label": "SFTP", "value": "sftp"},
   {"label": "Local filesystem", "value": "local"},
 ]}>
 
@@ -122,6 +123,18 @@ For more info, see
 
 </TabItem>
+<TabItem value="sftp">
+
+dlt supports several authentication methods:
+
+1. Key-based authentication
+2. SSH Agent-based authentication
+3. Username/Password authentication
+4. GSS-API authentication
+
+Learn more about SFTP authentication options in the [SFTP section](../../destinations/filesystem#sftp). To obtain credentials, contact your server administrator.
+
+</TabItem>
 <TabItem value="local">
 
 You don't need any credentials for the local filesystem.
 
@@ -143,6 +156,7 @@ a bucket, can be specified in `config.toml`.
   {"label": "AWS S3", "value": "aws"},
   {"label": "GCS/GDrive", "value": "gcp"},
   {"label": "Azure", "value": "azure"},
+  {"label": "SFTP", "value": "sftp"},
   {"label": "Local filesystem", "value": "local"},
 ]}>
 
@@ -195,6 +209,37 @@ bucket_url="gs://<bucket_name>/<path_to_files>/"
 ```
 
 </TabItem>
+<TabItem value="sftp">
+
+Learn how to set up SFTP credentials for each authentication method in the [SFTP section](../../destinations/filesystem#sftp).
+For example, in the case of key-based authentication, you can configure the source as follows:
+
+```toml
+# secrets.toml
+[sources.filesystem.credentials]
+sftp_username = "foo"
+sftp_key_filename = "/path/to/id_rsa" # Replace with the path to your private key file
+sftp_key_passphrase = "your_passphrase" # Optional: passphrase for your private key
+
+# config.toml
+[sources.filesystem] # use [sources.readers] for the "readers" source
+bucket_url = "sftp://[hostname]/[path]"
+```
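+
+To sanity-check the configuration, you could run the source end to end. The following is a minimal sketch; it assumes the `filesystem` resource and `read_csv` transformer from `dlt.sources.filesystem`, and the pipeline, table, and glob names are placeholders:
+
+```py
+import dlt
+from dlt.sources.filesystem import filesystem, read_csv
+
+# bucket_url and credentials are resolved from config.toml and secrets.toml
+files = filesystem(file_glob="*.csv")
+pipeline = dlt.pipeline(pipeline_name="sftp_example", destination="duckdb")
+info = pipeline.run((files | read_csv()).with_name("sftp_files"))
+print(info)
+```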
+
+</TabItem>
 <TabItem value="local">
 
 You can use both native local filesystem paths and `file://` URI. Absolute, relative, and UNC Windows paths are supported.
@@ -219,7 +251,7 @@ bucket_url='~\Documents\csv_files\'
 
 You can also specify the credentials using Environment variables. The name of the corresponding environment
-variable should be slightly different than the corresponding name in the `toml` file. Simply replace dots `.` with double
+variable should be slightly different from the name used in the `toml` file. Simply replace dots `.` with double
 underscores `__`:
 
 ```sh
diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/filesystem/index.md b/docs/website/docs/dlt-ecosystem/verified-sources/filesystem/index.md
index 32e0df77c2..1441931340 100644
--- a/docs/website/docs/dlt-ecosystem/verified-sources/filesystem/index.md
+++ b/docs/website/docs/dlt-ecosystem/verified-sources/filesystem/index.md
@@ -1,6 +1,6 @@
 ---
-title: Filesystem & Buckets
-description: dlt-verified source for Filesystem & Buckets
+title: Filesystem & cloud storage
+description: dlt-verified source for Filesystem & cloud storage
 keywords: [readers source and filesystem, files, filesystem, readers source, cloud storage]
 ---
 
@@ -8,7 +8,8 @@ The Filesystem source allows seamless loading of files from the following locati
 * AWS S3
 * Google Cloud Storage
 * Google Drive
-* Azure
+* Azure Blob Storage
+* remote filesystem (via SFTP)
 * local filesystem
 
 The Filesystem source natively supports `csv`, `parquet`, and `jsonl` files and allows customization for loading any type of structured files.
diff --git a/docs/website/docs/tutorial/filesystem.md b/docs/website/docs/tutorial/filesystem.md
index b2555db39b..f939cc1f4f 100644
--- a/docs/website/docs/tutorial/filesystem.md
+++ b/docs/website/docs/tutorial/filesystem.md
@@ -4,7 +4,7 @@ description: Learn how to load data files like JSON, JSONL, CSV, and Parquet fro
 keywords: [dlt, tutorial, filesystem, cloud storage, file system, python, data pipeline, incremental loading, json, jsonl, csv, parquet, duckdb]
 ---
 
-This tutorial is for you if you need to load data files like JSONL, CSV, and Parquet from either Cloud Storage (e.g., AWS S3, Google Cloud Storage, Google Drive, Azure Blob Storage) or a local file system.
+This tutorial is for you if you need to load data files like JSONL, CSV, and Parquet from either Cloud Storage (e.g., AWS S3, Google Cloud Storage, Google Drive, Azure Blob Storage), a remote file system (via SFTP), or a local file system.
 
 ## What you will learn
 
diff --git a/docs/website/sidebars.js b/docs/website/sidebars.js
index 7e6000a2ca..32bb554842 100644
--- a/docs/website/sidebars.js
+++ b/docs/website/sidebars.js
@@ -67,7 +67,7 @@ const sidebars = {
     {
       type: 'category',
       label: 'Filesystem & cloud storage',
-      description: 'AWS S3, Google Cloud Storage, Azure Blob Storage, local file system',
+      description: 'AWS S3, Google Cloud Storage, Azure, SFTP, local file system',
       link: {
         type: 'doc',
         id: 'dlt-ecosystem/verified-sources/filesystem/index',