conf/jobs_metadata.yml

# Jobs details below. Common job params listed at the bottom of this file.
jobs:
  examples/ex0_extraction_job.py:
    description: "Sample API extraction job, pulling public wikipedia data."
    api_inputs: {'path': 'https://raw.githubusercontent.com/wikimedia-research/Discovery-Hiring-Analyst-2016/master/events_log.csv.gz'}
    output: {'path':'{{base_path}}/wiki_example/input/{{now}}/dataset.csv', 'type':'csv', 'df_type':'pandas'}
    frequency: '@once'
    spark_boot: False

  examples/ex1_sql_job.sql:
    description: "shows sql job, easiest when sql is enough"
    py_job: 'jobs/generic/sql_pandas_job.py'
    inputs:
      some_events: {'path':"{{base_path}}/wiki_example/input/{{latest}}/", 'type':'csv', 'df_type':'pandas'}
      other_events: {'path':"{{base_path}}/wiki_example/input/{{latest}}/", 'type':'csv', 'df_type':'pandas'}
    output: {'path':'{{base_path}}/wiki_example_sql/output_ex1_sql_pandas/{{now}}/dataset.csv', 'type':'csv', 'df_type':'pandas'}
    dependencies: [examples/ex0_extraction_job.py]
    spark_boot: False
    frequency: '@daily'
    start_date: '{today}T07:00:00+00:00'
    emails: ['some_email@address.com']

  examples/ex7_pandas_job.py:
    description: "job loading and processing data with pandas. No spark involved."
    inputs:
      some_events: {'path':"{{base_path}}/wiki_example/input/{{latest}}/", 'type':'csv', 'df_type':'pandas'}
      other_events: {'path':"{{base_path}}/wiki_example/input/{{latest}}/", 'type':'csv', 'df_type':'pandas', 'read_kwargs':{}}
    output: {'path':'{{base_path}}/wiki_example/output_ex7_pandas/{{now}}/dataset.csv', 'type':'csv', 'df_type':'pandas', 'save_kwargs':{'sep':'|'}}
    dependencies: [examples/ex0_extraction_job.py]
    frequency: '@once'
    airflow.default_args.retries: 3
    airflow.default_args.retry_delay: 'timedelta(minutes=5)'
    spark_boot: False

  examples/ex1_sql_spark_job:
    description: "shows sql job, using spark, for large datasets."
    py_job: 'jobs/generic/sql_spark_job.py'
    sql_file: 'jobs/examples/ex1_sql_job.sql'
    inputs:
      some_events: {'path':"{{base_path}}/wiki_example/input/{{latest}}/", 'type':'csv'}
      other_events: {'path':"{{base_path}}/wiki_example/input/{{latest}}/", 'type':'csv'}
    output: {'path':'{{base_path}}/wiki_example_sql/output_ex1_sql_spark/{{now}}/', 'type':'csv'}
    dependencies: [examples/ex0_extraction_job.py]
    repartition: 1
    frequency: '@once'
    emails: ['some_email@address.com']

  examples/ex1_frameworked_job.py:
    description: "shows frameworked pyspark ops, same as ex1_sql_job but gives access to spark ops to expand on sql."
    inputs:
      some_events: {'path':"{{base_path}}/wiki_example/input/{{latest}}/", 'type':'csv'}
      other_events: {'path':"{{base_path}}/wiki_example/input/{{latest}}/", 'type':'csv'}
    output: {'path':'{{base_path}}/wiki_example/output_ex1_frameworked/{{now}}/', 'type':'csv'}
    dependencies: [examples/ex0_extraction_job.py]
    frequency: 1 day
    start_date: "{today}T07:00:00+00:00"
    emails: ['some_email@address.com']

  job_using_generic_template:
    description: "to show how to reuse existing job code"
    py_job: 'jobs/examples/ex1_frameworked_job.py'
    inputs:
      some_events: {'path':"{{base_path}}/wiki_example/input/{{latest}}/", 'type':'csv'}
      other_events: {'path':"{{base_path}}/wiki_example/input/{{latest}}/", 'type':'csv'}
    output: {'path':'{{base_path}}/wiki_example/output_ex1_frameworked_p2/{{now}}/', 'type':'csv'}
    dependencies: [examples/ex0_extraction_job.py]

  examples/ex1_raw_job.py:
    frequency: 24h
    emails: ['some_email@address.com']

  examples/ex2_frameworked_job.py:
    description: "more complex version of ex1_frameworked_job"
    inputs:
      some_events: {'path':"{{base_path}}/wiki_example/input/{{latest}}/", 'type':'csv'}
      other_events: {'path':"{{base_path}}/wiki_example/input/{{latest}}/", 'type':'csv'}
    output: {'path':'{{base_path}}/wiki_example/output_ex2/{{now}}/', 'type':'csv'}
    frequency: 1 day
    start_date: "{today}T07:00:00+00:00"
    emails: ['some_email@address.com']

  examples/ex3_incremental_job.py:
    description: "focus on incremental loading and dropping"
    inputs:
      processed_events: {'path':"{{base_path}}/wiki_example/output_ex3_dep/{{latest}}/", 'type':'csv', 'inc_field': 'timestamp_obj'}
    output: {'path':'{{base_path}}/wiki_example/output_ex3_inc/incremental_build_v1/', 'type':'csv', 'inc_field': 'other_timestamp'}
    dependencies: [examples/ex3_incremental_prep_job.py]
    frequency: 24h
    emails: ['some_email@address.com']

  examples/ex3_incremental_prep_job.py:
    description: "shows computation of dependency as necessary for ex3_incremental_job"
    inputs:
      some_events: {'path':"{{base_path}}/wiki_example/input/{{latest}}/", 'type':'csv'}
    output: {'path':'{{base_path}}/wiki_example/output_ex3_dep/{{now}}/', 'type':'csv'}
    frequency: 24h
    emails: ['some_email@address.com']

  examples/ex4_dependency1_job.py:
    description: "shows dependency"
    inputs:
      some_events: {'path':"{{base_path}}/wiki_example/input/{{latest}}/", 'type':'csv'}
    output: {'path':'{{base_path}}/wiki_example/output_ex4_dep1/{{now}}/', 'type':'csv'}

  examples/ex4_dependency2_job.py:
    description: "shows dependency without specifying inputs path since it is pulled from upstream."
    inputs:
      some_events: {'path':'{{base_path}}/wiki_example/output_ex4_dep1/{{latest}}/', 'type':'csv', 'df_type':'pandas'}
    output: {'path':'{{base_path}}/wiki_example/output_ex4_dep2/{{now}}/dataset.csv', 'df_type':'pandas', 'type':'csv'}
    dependencies: [examples/ex4_dependency1_job.py]

  examples/ex4_dependency3_job.sql:
    description: "shows dependency with sql"
    py_job: 'jobs/generic/sql_pandas_job.py'
    inputs:
      some_events: {'path':'{{base_path}}/wiki_example/output_ex4_dep2/{{latest}}/', 'type':'csv', 'df_type':'pandas', 'from':'examples/ex4_dependency2_job.py'}  # 'path' not needed when run as dependency
    output: {'path':'{{base_path}}/wiki_example/output_ex4_dep3/{{now}}/dataset.csv', 'type':'csv', 'df_type':'pandas'}
    dependencies: [examples/ex4_dependency2_job.py, examples/ex4_dependency1_job.py]
    spark_boot: False

  examples/ex4_dependency4_job.py:
    description: "shows dependency"
    inputs:
      some_events: {'path':'{{base_path}}/wiki_example/output_ex4_dep3/{{latest}}/', 'type':'csv', 'from':'examples/ex4_dependency3_job.sql'}
    output: {'path':'{{base_path}}/wiki_example/output_ex4_dep4/{{now}}/', 'type':'csv'}
    dependencies: [examples/ex4_dependency3_job.sql]

  examples/ex5_copy_to_oracle_job.py:
    description: "shows frameworked pyspark ops, same as ex1_sql_job but gives access to spark ops to expand on sql."
    inputs:
      some_events: {'path':"{{base_path}}/wiki_example/input/{{latest}}/", 'type':'csv'}
      other_events: {'path':"{{base_path}}/wiki_example/input/{{latest}}/", 'type':'csv'}
    output: {'path':'{{base_path}}/wiki_example/output_ex5_copy_to_oracle/{{now}}/', 'type':'csv'}
    copy_to_oracle: {'creds': 'oracle', 'table': 'sandbox.test_ex5_pyspark_job'}

  examples/ex5_copy_to_redshift_job.py:
    description: "shows frameworked pyspark ops, same as ex1_sql_job but gives access to spark ops to expand on sql."
    inputs:
      some_events: {'path':"{{base_path}}/wiki_example/input/{{latest}}/", 'type':'csv'}
      other_events: {'path':"{{base_path}}/wiki_example/input/{{latest}}/", 'type':'csv'}
    output: {'path':'{{base_path}}/wiki_example/output_ex5_copy_to_oracle/{{now}}/', 'type':'csv'}
    copy_to_redshift: {'creds': 'some_redshift_cred_section', 'table': '{schema}.test_ex5_pyspark_job'}

  examples/ex6_mysql_job.py:
    description: "requires mysql instance running"
    api_inputs: {'api_creds': 'some_mysql_cred_section', 'note':'API Job that relies on creds from conf/connections.cfg'}
    output: {'path':'{{base_path}}/mysql_example/output_ex6_mysql/{{now}}/', 'type':'csv'}

  examples/job_with_no_output:
    description: "shows job with no output (still requiring table as output but not dumped to disk)"
    py_job: 'jobs/examples/ex1_frameworked_job.py'
    inputs:
      some_events: {'path':"{{base_path}}/wiki_example/input/{{latest}}/", 'type':'csv'}
      other_events: {'path':"{{base_path}}/wiki_example/input/{{latest}}/", 'type':'csv'}
    output: {'path':'n/a', 'type':'None'}

  examples/job_with_more_resources:
    description: "To show how to change the machine specs and size of the cluster. See ec2_instance_master and emr_core_instances params below"
    py_job: 'jobs/examples/ex1_frameworked_job.py'
    inputs:
      some_events: {'path':"{{base_path}}/wiki_example/input/{{latest}}/", 'type':'csv'}
      other_events: {'path':"{{base_path}}/wiki_example/input/{{latest}}/", 'type':'csv'}
    output: {'path':'{{base_path}}/wiki_example/output_ex1_frameworked_p3/{{now}}/', 'type':'csv'}
    ec2_instance_master: 'm5.4xlarge'
    ec2_instance_slaves: 'm5.4xlarge'
    emr_core_instances: 3
    emr_applications: [{'Name': 'Hadoop'}, {'Name': 'Spark'}, {'Name': 'Livy'}, {'Name': 'JupyterHub'}]

  examples/ex7_pandas_with_sql_job.py:
    description: "job loading and processing data with pandas. No spark involved."
    inputs:
      some_events: {'path':"{{base_path}}/wiki_example/input/{{latest}}/", 'type':'csv', 'df_type':'pandas'}
      other_events: {'path':"{{base_path}}/wiki_example/input/{{latest}}/", 'type':'csv', 'df_type':'pandas', 'read_kwargs':{}}
    output: {'path':'{{base_path}}/wiki_example/output_ex7_pandas_with_sql/{{now}}/dataset.csv', 'type':'csv', 'df_type':'pandas', 'save_kwargs':{'sep':'|'}}
    spark_boot: False

  examples/ex7_hybrid_pandas_spark_job.py:
    description: "Job processing in pandas, loading and dropping done with spark."
    inputs:
      some_events: {'path':"{{base_path}}/wiki_example/input/{{latest}}/", 'type':'csv'}
      other_events: {'path':"{{base_path}}/wiki_example/input/{{latest}}/", 'type':'csv'}
    output: {'path':'{{base_path}}/wiki_example/output_ex7_hybrid_pandas_spark/{{now}}/', 'type':'csv'}

  examples/ex8_koalas_job.py:
    inputs:
      some_events: {'path':"{{base_path}}/wiki_example/input/{{latest}}/", 'type':'csv'}
      other_events: {'path':"{{base_path}}/wiki_example/input/{{latest}}/", 'type':'csv'}
    output: {'path':'{{base_path}}/wiki_example/output_ex8_koalas/{{now}}/', 'type':'csv'}

  examples/ex9_redshift_job.py:
    inputs:
      some_events: {'path':"{{base_path}}/wiki_example/input/{{latest}}/", 'type':'csv'}
    output: {'path':'{{base_path}}/wiki_example/output_ex9_redshift/{{now}}/', 'type':'csv'}
    copy_to_redshift: {'creds': 'some_redshift_cred_section', 'table': '{schema}.test_ex9_redshift'}

  examples/ex9_mysql_job.py:
    db_inputs: {'creds': 'some_mysql_cred_section', 'note':'API Job that relies on creds from conf/connections.cfg'}
    output: {'path':'{{base_path}}/wiki_example/output_ex9_mysql/{{now}}/', 'type':'csv'}

  examples/ex9_mysql_framework_load_job:
    py_job: jobs/generic/copy_job.py
    inputs:
      table_to_copy: {'type':'mysql', 'db_table': 'some_schema.some_table', 'creds': 'some_mysql_cred_section', 'note':'creds defined in conf/connections.cfg'}
    output: {'path':'{{base_path}}/db_example/output_ex9_mysql_direct/{{now}}/', 'type':'csv'}
    load_connectors: all
    enable_db_push: True

  examples/ex9_clickhouse_job.py:
    db_inputs: {'creds': 'some_clickhouse_cred_section', 'note':'API Job that relies on creds from conf/connections.cfg'}
    output: {'path':'{{base_path}}/db_example/output_ex9_clickhouse/{{now}}/', 'type':'csv'}

  examples/ex9_clickhouse_framework_load_job:
    py_job: jobs/generic/copy_job.py
    inputs:
      table_to_copy: {'type':'clickhouse', 'db_table': 'some_schema.some_table', 'creds': 'some_clickhouse_cred_section', 'note':'creds defined in conf/connections.cfg'}
    output: {'path':'{{base_path}}/db_example/output_ex9_clickhouse_direct/{{now}}/', 'type':'csv'}
    copy_to_clickhouse: {'creds': 'some_clickhouse_cred_section', 'table': 'public.test_push_arthur'}
    load_connectors: all
    enable_db_push: True

  examples/ex10_excel_load_job:
    py_job: jobs/generic/copy_job.py
    inputs:
      table_to_copy: {'path':"tests/fixtures/data_sample/wiki_example/input_excel/parts.xlsx", 'type':'xlsx', 'df_type':'pandas', 'read_kwargs':{'engine': 'openpyxl', 'sheet_name': 0, 'header': 1}}  # for xls files, use 'type':'xls' and 'engine': 'xlrd'
    output: {'path':'{{base_path}}/load_example/excel/{{now}}/dataset.csv', 'type':'csv', 'df_type':'pandas'}
    spark_boot: False

  examples/copy_local_to_cloud_job:
    description: "Copy data from local to cloud. To be executed from local only."
    py_job: jobs/generic/copy_job.py
    inputs:
      table_to_copy: {'path':"tests/fixtures/data_sample/wiki_example/input/", 'type':'csv'}
    output: {'path':'{{base_path}}/load_example/test_files/{{now}}/', 'type':'csv'}
    base_path: 's3a://mylake-dev/pipelines_data'
    aws_config_file:  conf/aws_config.cfg
    aws_setup: dev
    load_connectors: all

  examples/copy_cloud_to_local_job:
    description: "Copy data from cloud to local. Not loading file content. To be run locally."
    py_job: jobs/generic/copy_raw_job.py
    inputs:
      files_to_copy: {'path':"{{base_path}}/load_example/test_files/", 'glob':'*/*.csv', 'type':'other'}
    output: {'path':'./data/load_example/test_files/{{now}}/'}
    base_path: 's3://mylake-dev/pipelines_data'
    aws_config_file:  conf/aws_config.cfg
    aws_setup: dev
    spark_boot: False

  examples/ex11_run_gpu_for_deeplearning.py:
    description: ""
    inputs:
      training_set: {'path':'{{base_path}}/gen_ai/finetuned_albert/raw/{{latest}}/', 'type':'csv', 'df_type':'pandas'}  # file available in repo in ./tests/fixtures/data_sample/gen_ai/text_classification.csv
    output: {'path':'{{base_path}}/load_example/test_files/{{now}}/dataset.csv', 'type':'csv', 'df_type':'pandas'}
    ec2_instance_master: 'g4dn.xlarge'  # access to this GPU instance may requires special request to AWS.
    emr_core_instances: 0

  examples/wordcount_frameworked_job.py:
    description: "shows raw pyspark rdd ops in framework, same as wordcount_raw_job"
    inputs:
      lines: {'path':"{{base_path}}/wordcount_example/input/sample_text.txt", 'type':'txt'}
    output: {'path':'{{base_path}}/wordcount_example/output_frameworked/{{now}}/', 'type':'txt'}
    dependencies: [] # list here if any
    frequency: 24h
    emails: ['some_email@address.com']

  examples/run_jobs:
    description: "shows a job meant to run several dependencies at once. The job itself takes no input and generates no output."
    py_job: 'jobs/generic/dummy_job.py'
    spark_boot: False
    dependencies: 
      - examples/ex0_extraction_job.py
      - examples/ex1_sql_job.sql
      - examples/ex7_pandas_job.py

  examples/ex12_run_scala_job:
    description: "Sample spark scala code (compiled into a jar), executed through spark-submit"
    jar_job: 'jobs/examples/ex12_scala_job/target/spark_scala_job_2.13-1.0.jar'
    scala_job: 'jobs/examples/ex12_scala_job/src/spark_scala_job.scala'  # for ref, compilation to be done manually
    sbt: 'jobs/examples/ex12_scala_job/build.sbt'  # for ref, compilation to be done manually
    spark_submit_args: '--verbose'
    spark_app_args: '{{base_path}}/wordcount_example/input/sample_text.txt'
    load_connectors: none
  
  examples/ex13_register_athena_job:
    description: "Job to demo registering data to Athena. Setup to be run locally, while pushing data to cloud and registering data to athena in AWS."
    py_job: jobs/examples/ex1_frameworked_job.py
    inputs:
      some_events: {'path':"./data/wiki_example/input/{{latest}}/", 'type':'csv'}
      other_events: {'path':"./data/wiki_example/input/{{latest}}/", 'type':'csv'}
    output: {'path':'{{base_path}}/wiki_example/output_ex7_pandas/{{now}}/', 'type':'csv'}
    base_path: 's3a://mylake-dev/pipelines_data'
    register_to_athena: {'table': 'sandbox.ex1_frameworked'}  # implies sandbox schema created, to be done with "CREATE DATABASE sandbox;" in SQL editor
    enable_db_push: True
    aws_config_file:  conf/aws_config.cfg
    aws_setup: dev
    athena_out: s3://mylake-dev/pipelines_data/athena_data/
    load_connectors: all

  examples/ex14_json_format1_load_job:
    description: "Format with basic json, with rows as lines, without []."
    py_job: jobs/generic/copy_job.py
    inputs:
      table_to_copy: {'path':"tests/fixtures/data_sample/wiki_example/input_json_format1/dataset.json", 'type':'json', 'df_type':'pandas', 'read_kwargs':{'lines':True}}
    output: {'path':'{{base_path}}/load_example/json_format1/{{now}}/dataset.csv', 'type':'csv', 'df_type':'pandas'}
    spark_boot: False

  examples/ex14_json_format2_load_job:
    description: "Format with basic valid json, with rows inside brackets, ex [rows_here]."
    py_job: jobs/generic/copy_job.py
    inputs:
      table_to_copy: {'path':"tests/fixtures/data_sample/wiki_example/input_json_format2/dataset.json", 'type':'json', 'df_type':'pandas'}
    output: {'path':'{{base_path}}/load_example/json_format2/{{now}}/dataset.csv', 'type':'csv', 'df_type':'pandas'}
    spark_boot: False

  examples/ex14_json_format3_load_job:
    description: "Format with rows inside structure like {'records':[rows_here]}."
    py_job: jobs/generic/copy_job.py
    inputs:
      table_to_copy: {'path':"tests/fixtures/data_sample/wiki_example/input_json_format3/dataset.json", 'type':'json', 'df_type':'json_pandas'}
    output: {'path':'{{base_path}}/load_example/json_format3/{{now}}/dataset.csv', 'type':'csv', 'df_type':'pandas'}
    spark_boot: False

  examples/ex15_copy_job_multi_path.py:
    description: "Job to demo loading datasets with the same schemas spread over various folders, following structured path. The job will go through combinations of category and subcategory params defined below to extract datasets."
    inputs:
      table_to_copy: {'path':"tests/fixtures/data_sample/wiki_example/input_multipath/{category}/{subcategory}/dataset.csv", 'type':'csv', 'df_type':'pandas', 'load':False}
    output: {'path':'{{base_path}}/load_example/multipath/{category}/{subcategory}/{{now}}/dataset.csv', 'type':'csv', 'df_type':'pandas'}
    category: ['catA', 'catB']
    subcategory: ['subcatA', 'subcatB']
    spark_boot: False

  examples/ex16_glob_load_job:
    py_job: jobs/generic/copy_job.py
    inputs:
      table_to_copy: {'path':"tests/fixtures/data_sample/wiki_example/input/", 'type':'csv', 'glob':'part1*', 'df_type':'pandas'}
    output: {'path':'{{base_path}}/load_example/glob_filtering/{{now}}/dataset.csv', 'type':'csv', 'df_type':'pandas'}
    spark_boot: False

  examples/ex17_multimode_params_job:
    description: |
      Showing how to use multiple, useful for single tenant setup, where base_path need to change for various. see --mode=dev_local,your_extra_tenant below.
      usage: python jobs/generic/launcher.py --job_name=examples/ex17_multimode_params_job --mode=dev_local,your_extra_tenant
    py_job: jobs/examples/ex7_pandas_job.py
    inputs:
      some_events: {'path':"{{base_path}}/wiki_example/input/{{latest}}/", 'type':'csv', 'df_type':'pandas'}
      other_events: {'path':"{{base_path}}/wiki_example/input/{{latest}}/", 'type':'csv', 'df_type':'pandas', 'read_kwargs':{}}
    output: {'path':'{{base_path}}/wiki_example/output_ex7_pandas/{{now}}/dataset.csv', 'type':'csv', 'df_type':'pandas', 'save_kwargs':{'sep':'|'}}
    spark_boot: False

  examples/ex18_placeholder_param_replacement_job:
    description: |
      Showing use of key replacement in strings parameters, i.e. see {{base_path_in}} and {{base_path_out}} below, automatically replace by value of 'base_path_in' and 'base_path_out' params below.
      usage: python jobs/generic/launcher.py --job_name=examples/ex18_placeholder_param_replacement_job
    py_job: jobs/examples/ex7_pandas_job.py
    inputs:
      some_events: {'path':"{{base_path_in}}/wiki_example/input/{{latest}}/", 'type':'csv', 'df_type':'pandas'}
      other_events: {'path':"{{base_path_in}}/wiki_example/input/{{latest}}/", 'type':'csv', 'df_type':'pandas', 'read_kwargs':{}}
    output: {'path':'{{base_path_out}}/wiki_example/output_ex7_pandas/{{now}}/dataset.csv', 'type':'csv', 'df_type':'pandas', 'save_kwargs':{'sep':'|'}}
    base_path_in: './data'
    base_path_out: './data_other'
    spark_boot: False

  examples/ex19_compare_pandas_job:
    description: "Showcasing a job to compare 2 datasets"
    py_job: jobs/generic/compare_job.py
    inputs:
      tableA: {'path':"{{base_path}}/wiki_example/input/{{latest}}/", 'pk':['uuid'], 'type':'csv', 'df_type':'pandas'}
      tableB: {'path':"{{base_path}}/wiki_example/input/{{latest}}/", 'pk':['uuid'], 'type':'csv', 'df_type':'pandas'}
    output: {'path':'{{base_path}}/wiki_example/output_ex19_compare/{{now}}/dataset.csv', 'type':'csv', 'df_type':'pandas'}
    spark_boot: False

  examples/ex20_list_files_job:
    description: "Job to create a dataset from a file list. Meant to work in aws only."
    py_job: jobs/generic/list_files_job.py
    inputs:
      files: {'path':"{{base_path}}/wiki_example/input/{{latest}}/", 'glob':'*/*.csv', 'type':'other'}
    output: {'path':'{{base_path}}/wiki_example/output_ex20_filelist/{{now}}/dataset.csv', 'type':'csv', 'df_type':'pandas'}
    spark_boot: False

  examples/ex21_deploy_airflow_job:
    description: "Test deployment to airflow."
    py_job: jobs/generic/copy_job.py
    inputs:
      table_to_copy: {'path':"tests/fixtures/data_sample/wiki_example/input/dataset.csv", 'type':'csv'}
    output: {'path':'{{base_path}}/load_example/test_files/{{now}}/dataset.csv', 'type':'csv'}
    s3_dags: 's3://mylake-dev/pipelines_metadata/airflow_dags'
    local_dags: './airflow_dags/'
    spark_boot: False

  # wordcount_raw_job: #Job exists but doesn't rely on jobs_metadata entries

  # ----- Marketing Jobs --------
  marketing/github_accounts_extraction_job.py:
    description: "Github API extraction job, pulling account data."
    api_inputs: {'creds': 'github'}
    inputs:
      github_accounts_man: {'path':"./data_manual/github/github_accounts.csv", 'type':'csv', 'df_type':'pandas'}
    output: {'path':'{{base_path}}/github/users/{{now}}/dataset.csv', 'type':'csv', 'df_type':'pandas'}
    spark_boot: False

  marketing/github_repos_extraction_job.py:
    description: "Github API extraction job, pulling repo data."
    api_inputs: {'creds': 'github'}
    inputs:
      github_accounts: {'path':"{{base_path}}/github/users/{{latest}}/dataset.csv", 'type':'csv', 'df_type':'pandas'}
    output: {'path':'{{base_path}}/github/user_repos/{{now}}/dataset.csv', 'type':'csv', 'df_type':'pandas'}
    dependencies: [marketing/github_accounts_extraction_job.py]
    spark_boot: False

  marketing/github_contributors_extraction_job.py:
    description: "Github API extraction job, pulling contributor data."
    api_inputs: {'creds': 'github'}
    inputs:
      repos: {'path':"{{base_path}}/github/user_repos/{{latest}}/dataset.csv", 'type':'csv', 'df_type':'pandas'}
    output: {'path':'{{base_path}}/github/contributors/{{now}}/dataset.csv', 'type':'csv', 'df_type':'pandas'}
    dependencies: [marketing/github_repos_extraction_job.py]
    spark_boot: False

  marketing/github_committers_extraction_job.py:
    description: "Github API extraction job, pulling committer data, especially email."
    api_inputs: {'creds': 'github'}
    inputs:
      contributors: {'path':"{{base_path}}/github/contributors/{{latest}}/dataset.csv", 'type':'csv', 'df_type':'pandas'}
    output: {'path':'{{base_path}}/github/committers/{{now}}/dataset.csv', 'type':'csv', 'df_type':'pandas'}
    dependencies: [marketing/github_contributors_extraction_job.py]
    spark_boot: False

  marketing/github_repo_list_extraction_job.py:
    description: "Github API extraction job, pulling repo data."
    api_inputs: {'creds': 'github'}
    output: {'path':'{{base_path}}/github/repos/{{now}}/dataset.csv', 'type':'csv', 'df_type':'pandas'}
    spark_boot: False

  marketing/github_traffic_extraction_job.py:
    description: "Github API extraction job, pulling repo data."
    inputs:
      my_repos: {'path':"./data_manual/github/my_github_repos.csv", 'type':'csv', 'df_type':'pandas'}
    api_inputs: {'creds': 'github'}
    output: {'path':'{{base_path}}/github/traffic/{{now}}/dataset.csv', 'type':'csv', 'df_type':'pandas'}
    spark_boot: False

  marketing/people_merge_job.py:
    description: "Merge people from linkedin, github..."
    inputs:
      perso: {'path':"./data_manual/people/people.csv", 'type':'csv'}
      linkedin: {'path':"./data_manual/linkedin/export/Connections_mod.csv", 'type':'csv'}
    output: {'path':'{{base_path}}/people/{now}/', 'type':'csv'}

  # ----- Dashboards--------
  dashboards/wikipedia_demo_dashboard.ipynb:
    description: "Showcase a Dashboard using 'panel', based on dummy wikipedia data (experimental)."
    inputs:
      ex1_sql: {'path':'{{base_path}}/wiki_example_sql/output_ex1_sql_pandas/{{latest}}/dataset.csv', 'type':'csv', 'df_type':'pandas'}
      ex7_pandas: {'path':'{{base_path}}/wiki_example/output_ex7_pandas/{{latest}}/dataset.csv', 'type':'csv', 'df_type':'pandas', 'read_kwargs':{'sep':'|'}}
    dependencies:
      - examples/ex1_sql_job.sql
      - examples/ex7_pandas_job.py


# ----- Params -------
common_params:
  all_mode_params:
    base_path: '{{root_path}}/pipelines_data'  # don't add '/' at the end
    s3_dags: '{{root_path}}/pipelines_metadata/airflow_dags'  # determines which airflow instance to use.
    s3_logs: '{{root_path}}/pipelines_metadata'
    connection_file:  conf/connections.cfg
    redshift_s3_tmp_dir: s3a://dev-spark/tmp_spark/
    email_cred_section: some_email_cred_section  # Section from "connection_file"
    spark_version: '3.5' # options: '2.4', '3.0', '3.4' or '3.5'
    k8s_url: 'k8s://https://kubernetes.docker.internal:6443'
    k8s_name: 'my-pyspark-job'
    k8s_executor_instances: '2'
    k8s_namespace: 'a_k8s_namespace'
    k8s_image_service: 'a_k8s_image_service'
    k8s_upload_path: 's3a://a_k8s_upload_path'
    k8s_driver_podTemplateFile: conf/k8s_setup_spark_submit_driver.yaml
    k8s_executor_podTemplateFile: conf/k8s_setup_spark_submit_executor.yaml
    aws_region: eu-west-1  # TODO: remove this.
    k8s_podname: a_podname  # TODO: make it nullable so jobs can rerun back to back.
    default_aws_modes: 'dev_EMR'
    default_local_modes: 'dev_local'
    aws_modes: ['dev_EMR','prod_EMR']
  mode_specific_params:
    prod_EMR:
      root_path: s3://mylake-prod  # don't add '/' at the end
      schema: frontroom
      emr_core_instances: 0
      aws_config_file:  conf/aws_config.cfg
      aws_setup:        pro
      jobs_folder:      jobs/
      load_connectors: all
      enable_db_push: True
      save_schemas: False
      manage_git_info: True
    dev_EMR:  # TODO: change to dev_aws
      root_path: s3://mylake-dev  # don't add '/' at the end
      schema: sandbox
      emr_core_instances: 0
      aws_config_file:  conf/aws_config.cfg
      aws_setup:        dev
      jobs_folder:      jobs/
      load_connectors: all
      enable_db_push: False
      save_schemas: False
      manage_git_info: True
    dev_local:
      root_path: '.'  # don't add '/' at the end
      base_path: '{{root_path}}/data'  # don't add '/' at the end
      schema: sandbox
      load_connectors: none
      aws_config_file:  none
      enable_db_push: False
      save_schemas: True
      manage_git_info: False
    your_extra_tenant:  # useful for single tenant architecture, with data in separate locations
      save_schemas: False
      other_param: 'some_value'