diff --git a/docs/website/docs/general-usage/customising-pipelines/removing_columns.md b/docs/website/docs/general-usage/customising-pipelines/removing_columns.md index 39f38579f4..fa936566d0 100644 --- a/docs/website/docs/general-usage/customising-pipelines/removing_columns.md +++ b/docs/website/docs/general-usage/customising-pipelines/removing_columns.md @@ -10,72 +10,82 @@ Removing columns before loading data into a database is a reliable method to eliminate
 unnecessary fields. For example, in the given scenario, a source is created with a
-"country_id" column, which is then excluded from the database before loading.
+"country_code" column, which is then excluded from the database before loading.
 
-```python
-import dlt
-
-@dlt.source
-def dummy_source(prefix: str = None):
-    #This function creates a dummy data source.
-    @dlt.resource(write_disposition='replace')
-    def dummy_data():
-        for _ in range(3):
-            yield {'id': _, 'name': f'Jane Washington {_}', 'country_id': 90 + _}
-    return dummy_data(),
-
-def remove_columns(doc, remove_columns=None):
-    """
-    Removes specified columns from a document (row of data).
-
-    This function is used to filter out columns from the data before loading it into a database,
-    which can be useful for excluding sensitive or unnecessary information.
-
-    :param doc: The document (row) from which columns will be removed.
-    :param remove_columns: List of column names to be removed, defaults to None.
-    :return: The document with specified columns removed.
-    """
-
-    if remove_columns is None:
-        remove_columns = []
-
-    # Iterating over the list of columns to be removed
-    for column_name in remove_columns:
-        # Removing the column if it exists in the document
-        if column_name in doc:
-            del doc[column_name]
-
-    return doc
-
-# Example usage:
-remove_columns_list = ["country_id"]
-
-# run it as it is
-for row in dummy_source().dummy_data.add_map(
-    lambda doc: remove_columns(doc, remove_columns_list)):
-    print(row)
-
-#{'id': 0, 'name': 'Jane Washington 0'}
-#{'id': 1, 'name': 'Jane Washington 1'}
-#{'id': 2, 'name': 'Jane Washington 2'}
-
-
-# Or create an instance of the data source, modify the resource and run the source.
-
-# 1. Create an instance of the source so you can edit it.
-data_source = dummy_source()
-# 2. Modify this source instance's resource
-data_source = (
-    data_source.dummy_data.add_map(
-        lambda doc: remove_columns(doc, remove_columns_list)
-    )
-)# 3. Inspect your result
-for row in data_source:
-    print(row)
-
-# Integrating with a DLT pipeline
-pipeline = dlt.pipeline(
-    pipeline_name='example',
-    destination='bigquery',
-    dataset_name='filtered_data'
-)
-load_info = pipeline.run(data_source)
-```
\ No newline at end of file
+Let's create a sample pipeline demonstrating the process of removing a column.
+
+1. Create a source function that generates dummy data as follows:
+
+    ```python
+    import dlt
+
+    @dlt.source
+    def dummy_source(prefix: str = None):
+        # This function creates a dummy data source.
+        @dlt.resource(write_disposition='replace')
+        def dummy_data():
+            for i in range(3):
+                yield {'id': i, 'name': f'Jane Washington {i}', 'country_code': 40 + i}
+        return dummy_data()
+    ```
+    This function yields rows with three columns: `id`, `name`, and `country_code`.
+
+1. Next, create a function to filter out columns from the data before loading it into a database:
+
+    ```python
+    def remove_columns(doc, remove_columns=None):
+        # Remove the listed columns from a single row (doc) before it is loaded
+        if remove_columns is None:
+            remove_columns = []
+
+        # Iterate over the list of columns to be removed
+        for column_name in remove_columns:
+            # Remove the column if it exists in the document
+            if column_name in doc:
+                del doc[column_name]
+
+        return doc
+    ```
+    `doc`: The document (row) from which columns will be removed.
+
+    `remove_columns`: List of column names to be removed; defaults to None.
+
+1. Then declare the columns to be removed from the table and modify the source as follows:
+
+    ```python
+    # Example columns to remove:
+    remove_columns_list = ["country_code"]
+
+    # 1. Create an instance of the source so you can edit it.
+    data_source = dummy_source()
+
+    # 2. Modify this source instance's resource
+    data_source = (
+        data_source.dummy_data.add_map(
+            lambda doc: remove_columns(doc, remove_columns_list)
+        )
+    )
+    ```
+
+1. You can optionally inspect the result:
+
+    ```python
+    for row in data_source:
+        print(row)
+    # {'id': 0, 'name': 'Jane Washington 0'}
+    # {'id': 1, 'name': 'Jane Washington 1'}
+    # {'id': 2, 'name': 'Jane Washington 2'}
+    ```
+
+1. Finally, create a pipeline and run it:
+
+    ```python
+    # Integrate the source with a dlt pipeline
+    pipeline = dlt.pipeline(
+        pipeline_name='example',
+        destination='bigquery',
+        dataset_name='filtered_data'
+    )
+
+    load_info = pipeline.run(data_source)
+    print(load_info)
+    ```
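+
+For a quick local test before pointing the pipeline at BigQuery, the same source can be loaded into DuckDB. This is a minimal sketch, assuming the DuckDB destination is installed (`pip install "dlt[duckdb]"`) and that `dummy_source` and `remove_columns` from the steps above are defined in the same module; the pipeline name `example_local` is only an illustrative choice:
+
+    ```python
+    import dlt
+
+    # Reuse the source and the mapping from the steps above
+    # (assumption: dummy_source and remove_columns are defined in this module)
+    data_source = dummy_source().dummy_data.add_map(
+        lambda doc: remove_columns(doc, ["country_code"])
+    )
+
+    pipeline = dlt.pipeline(
+        pipeline_name='example_local',  # hypothetical name for the local test run
+        destination='duckdb',
+        dataset_name='filtered_data'
+    )
+
+    # The loaded table should contain only the `id` and `name` columns
+    print(pipeline.run(data_source))
+    ```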