From 4f62bdd2957a690105f4f71e24a121fd104566ae Mon Sep 17 00:00:00 2001
From: AlejandroUPC
Date: Sat, 28 May 2022 12:24:53 +0200
Subject: [PATCH 01/10] :wrench: refactor ex0

---
 jobs/examples/ex0_extraction_job.py | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/jobs/examples/ex0_extraction_job.py b/jobs/examples/ex0_extraction_job.py
index 5f9d297e..171680d2 100644
--- a/jobs/examples/ex0_extraction_job.py
+++ b/jobs/examples/ex0_extraction_job.py
@@ -3,20 +3,22 @@
 import requests
 import os
 import pandas as pd
+from pyspark import sql


 class Job(ETL_Base):
-    def transform(self):
-        url = self.jargs.api_inputs['path']
+    def transform(self) -> sql.DataFrame:
+        url = self.jargs.api_inputs["path"]
         resp = requests.get(url, allow_redirects=True)
-        self.logger.info('Finished reading file from {}.'.format(url))
+        self.logger.info("Finished reading file from {}.".format(url))

         # Save to local
-        tmp_dir = 'tmp'
-        os.makedirs(tmp_dir, exist_ok = True)
-        local_path = tmp_dir+'/tmp_file.csv.gz'
-        open(local_path, 'wb').write(resp.content)  # creating local copy, necessary for sc_sql.read.csv, TODO: check to remove local copy step.
-        self.logger.info('Copied file locally at {}.'.format(local_path))
+        os.makedirs("tmp", exist_ok=True)
+        local_path = "tmp/tmp_file.csv.gz"
+        with open(local_path, "wb") as lp:
+            lp.write(resp.content)
+        # creating local copy, necessary for sc_sql.read.csv, TODO: check to remove local copy step.
+        self.logger.info(f"Copied file locally at {local_path}.")

         # Save as dataframe
         pdf = pd.read_csv(local_path)
@@ -25,5 +27,5 @@ def transform(self):


 if __name__ == "__main__":
-    args = {'job_param_file': 'conf/jobs_metadata.yml'}
+    args = {"job_param_file": "conf/jobs_metadata.yml"}
     Commandliner(Job, **args)

From e8901562e867ba59d312e606ed00eb263beead94 Mon Sep 17 00:00:00 2001
From: AlejandroUPC
Date: Sat, 28 May 2022 12:29:39 +0200
Subject: [PATCH 02/10] :wrench: refactor ex1

---
 jobs/examples/ex1_frameworked_job.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/jobs/examples/ex1_frameworked_job.py b/jobs/examples/ex1_frameworked_job.py
index 1f76d8ca..520c7c63 100644
--- a/jobs/examples/ex1_frameworked_job.py
+++ b/jobs/examples/ex1_frameworked_job.py
@@ -1,20 +1,22 @@
 """Same as ex1_full_sql_job.sql but allows access to spark for more complex ops (not used here but in ex2_frameworked_job.py)."""
 from yaetos.etl_utils import ETL_Base, Commandliner
+from pyspark import sql


 class Job(ETL_Base):
-    def transform(self, some_events, other_events):
-        df = self.query("""
+    def transform(self, some_events, other_events) -> sql.DataFrame:
+        return self.query(
+            f"""
             SELECT se.session_id, count(*) as count_events
-            FROM some_events se
-            JOIN other_events oe on se.session_id=oe.session_id
+            FROM {some_events} se
+            JOIN {other_events} oe on se.session_id=oe.session_id
             WHERE se.action='searchResultPage' and se.n_results>0
             group by se.session_id
             order by count(*) desc
-        """)
-        return df
+        """
+        )


 if __name__ == "__main__":
-    args = {'job_param_file': 'conf/jobs_metadata.yml'}
+    args = {"job_param_file": "conf/jobs_metadata.yml"}
     Commandliner(Job, **args)

From b8e47cf3433a7742187bcf79c87df739dedcf306 Mon Sep 17 00:00:00 2001
From: AlejandroUPC
Date: Sat, 28 May 2022 12:32:10 +0200
Subject: [PATCH 03/10] :wrench: refactor ex1_raw_job

---
 jobs/examples/ex1_raw_job.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/jobs/examples/ex1_raw_job.py b/jobs/examples/ex1_raw_job.py
index 698b2885..37acbc73
100644 --- a/jobs/examples/ex1_raw_job.py +++ b/jobs/examples/ex1_raw_job.py @@ -22,14 +22,14 @@ # Start SparkContext -sc = SparkContext(appName='ex1_raw_job') +sc = SparkContext(appName="ex1_raw_job") sc_sql = SQLContext(sc) # Load data from S3 bucket some_events = sc_sql.read.csv(input_some_events, header=True) -some_events.createOrReplaceTempView('some_events') +some_events.createOrReplaceTempView("some_events") other_events = sc_sql.read.csv(input_other_events, header=True) -other_events.createOrReplaceTempView('other_events') +other_events.createOrReplaceTempView("other_events") # Calculate word counts query_str = """ From 8c24fc2942ad025d51a8aaa4f4ec44b51fcc6192 Mon Sep 17 00:00:00 2001 From: AlejandroUPC Date: Sat, 28 May 2022 13:09:40 +0200 Subject: [PATCH 04/10] :wrench: adding tests --- .../jobs/examples/test_ex2_frameworked_job.py | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 tests/jobs/examples/test_ex2_frameworked_job.py diff --git a/tests/jobs/examples/test_ex2_frameworked_job.py b/tests/jobs/examples/test_ex2_frameworked_job.py new file mode 100644 index 00000000..fe21b1b7 --- /dev/null +++ b/tests/jobs/examples/test_ex2_frameworked_job.py @@ -0,0 +1,24 @@ +import pytest +from jobs.examples.ex2_frameworked_job import Job +from datetime import datetime + + +def test_format_datetime_conversion_works() -> None: + data = "20220101000000" + expected = "2022-01-01 00:00:00" + result = Job.format_datetime(data) + assert expected == result + + +def test_format_datetime_conversion_fails_format_not_valid() -> None: + data = "this is not a date" + with pytest.raises(ValueError): + result = Job.format_datetime(data) + + +def test_date_diff_seconds_works() -> None: + start_date = datetime(year=2000, month=1, day=1) + end_date = datetime(year=2001, month=1, day=1) + expected = 366 * 24 * 3600 + result = Job.date_diff_sec(start_date, end_date) + assert expected == result From f4eee73d5a692d7cc42100f6287a3b363aaae606 Mon Sep 17 00:00:00 2001 From: AlejandroUPC Date: Sat, 28 May 2022 13:10:14 +0200 Subject: [PATCH 05/10] :wrench: refactor ex2_fworked_job --- jobs/examples/ex2_frameworked_job.py | 43 +++++++++++++++------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/jobs/examples/ex2_frameworked_job.py b/jobs/examples/ex2_frameworked_job.py index e8b19c24..0c45e28f 100644 --- a/jobs/examples/ex2_frameworked_job.py +++ b/jobs/examples/ex2_frameworked_job.py @@ -2,29 +2,34 @@ from pyspark.sql.functions import udf, array from pyspark.sql.types import StringType, IntegerType from pyspark.sql.functions import col +from pyspark import sql +from datetime import datetime class Job(ETL_Base): - def transform(self, some_events, other_events): + def transform(self, some_events, other_events) -> sql.DataFrame: """For demo only. 
Functional but no specific business logic.""" - df = self.query(""" + df = self.query( + """ SELECT se.timestamp, se.session_id, se.group, se.action FROM some_events se JOIN other_events oe on se.session_id=oe.session_id WHERE se.action='searchResultPage' and se.n_results>0 - """) + """ + ) udf_format_datetime = udf(self.format_datetime, StringType()) - events_cleaned = df \ - .withColumn('timestamp_obj', udf_format_datetime(df.timestamp).cast("timestamp")) \ - .where(col('timestamp').like("%2.016%") == False) + events_cleaned = df.withColumn( + "timestamp_obj", udf_format_datetime(df.timestamp).cast("timestamp") + ).where(col("timestamp").like("%2.016%") == False) events_cleaned.createOrReplaceTempView("events_cleaned") self.sc_sql.registerFunction("date_diff_sec", self.date_diff_sec, IntegerType()) - output = self.query(""" + output = self.query( + """ WITH session_times as ( SELECT timestamp, timestamp_obj, session_id, group, action, @@ -44,25 +49,23 @@ def transform(self, some_events, other_events): select * from session_grouped order by delta_sec desc, first_timestamp - """) + """ + ) return output @staticmethod - def format_datetime(wiki_dt): - dt = {} - dt['year'] = wiki_dt[:4] - dt['month'] = wiki_dt[4:6] - dt['day'] = wiki_dt[6:8] - dt['hour'] = wiki_dt[8:10] - dt['minute'] = wiki_dt[10:12] - dt['sec'] = wiki_dt[12:14] - return '{year}-{month}-{day} {hour}:{minute}:{sec}'.format(**dt) + def format_datetime(wiki_dt_str: str) -> str: + """ + Converts a string date to a datetime to then parse to a target format. + """ + wiki_dt = datetime.strptime(wiki_dt_str, "%Y%m%d%H%M%S") + return wiki_dt.strftime("%Y-%m-%d %H:%M:%S") @staticmethod - def date_diff_sec(x,y): - return int((y-x).total_seconds()) + def date_diff_sec(start_dt: datetime, end_dt: datetime) -> int: + return int((end_dt - start_dt).total_seconds()) if __name__ == "__main__": - args = {'job_param_file': 'conf/jobs_metadata.yml'} + args = {"job_param_file": "conf/jobs_metadata.yml"} Commandliner(Job, **args) From 0b61cc03003076f281bc091b9ffa4f614f033363 Mon Sep 17 00:00:00 2001 From: AlejandroUPC Date: Sat, 28 May 2022 13:14:43 +0200 Subject: [PATCH 06/10] :wrench: random refactor --- jobs/examples/ex1_frameworked_job.py | 4 +++- jobs/examples/ex3_incremental_job.py | 12 +++++++----- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/jobs/examples/ex1_frameworked_job.py b/jobs/examples/ex1_frameworked_job.py index 520c7c63..0028f6d5 100644 --- a/jobs/examples/ex1_frameworked_job.py +++ b/jobs/examples/ex1_frameworked_job.py @@ -4,7 +4,9 @@ class Job(ETL_Base): - def transform(self, some_events, other_events) -> sql.DataFrame: + def transform( + self, some_events="some_events", other_events="other_events" + ) -> sql.DataFrame: return self.query( f""" SELECT se.session_id, count(*) as count_events diff --git a/jobs/examples/ex3_incremental_job.py b/jobs/examples/ex3_incremental_job.py index 2633e244..22ea3c1b 100644 --- a/jobs/examples/ex3_incremental_job.py +++ b/jobs/examples/ex3_incremental_job.py @@ -2,15 +2,17 @@ class Job(ETL_Base): - def transform(self, processed_events): - df = self.query(""" + def transform(self, processed_events="processed_events"): + df = self.query( + f""" SELECT timestamp_obj as other_timestamp, * - FROM processed_events se + FROM {processed_events} se order by timestamp_obj - """) + """ + ) return df if __name__ == "__main__": - args = {'job_param_file': 'conf/jobs_metadata.yml'} + args = {"job_param_file": "conf/jobs_metadata.yml"} Commandliner(Job, **args) From 
de7ce9b62c6ddfaf2769c9f9c3b82f72fb1fa315 Mon Sep 17 00:00:00 2001 From: AlejandroUPC Date: Sat, 28 May 2022 13:20:45 +0200 Subject: [PATCH 07/10] :wrench: big commit --- jobs/examples/ex2_frameworked_job.py | 12 - jobs/examples/ex3_incremental_job.py | 4 +- jobs/examples/ex3_incremental_prep_job.py | 20 +- jobs/examples/ex4_dependency1_job.py | 17 +- yaetos/etl_utils.py | 1382 ++++++++++++++------- 5 files changed, 969 insertions(+), 466 deletions(-) diff --git a/jobs/examples/ex2_frameworked_job.py b/jobs/examples/ex2_frameworked_job.py index 0c45e28f..b5570d3f 100644 --- a/jobs/examples/ex2_frameworked_job.py +++ b/jobs/examples/ex2_frameworked_job.py @@ -53,18 +53,6 @@ def transform(self, some_events, other_events) -> sql.DataFrame: ) return output - @staticmethod - def format_datetime(wiki_dt_str: str) -> str: - """ - Converts a string date to a datetime to then parse to a target format. - """ - wiki_dt = datetime.strptime(wiki_dt_str, "%Y%m%d%H%M%S") - return wiki_dt.strftime("%Y-%m-%d %H:%M:%S") - - @staticmethod - def date_diff_sec(start_dt: datetime, end_dt: datetime) -> int: - return int((end_dt - start_dt).total_seconds()) - if __name__ == "__main__": args = {"job_param_file": "conf/jobs_metadata.yml"} diff --git a/jobs/examples/ex3_incremental_job.py b/jobs/examples/ex3_incremental_job.py index 22ea3c1b..faffae68 100644 --- a/jobs/examples/ex3_incremental_job.py +++ b/jobs/examples/ex3_incremental_job.py @@ -1,8 +1,8 @@ from yaetos.etl_utils import ETL_Base, Commandliner - +from pyspark import sql class Job(ETL_Base): - def transform(self, processed_events="processed_events"): + def transform(self, processed_events="processed_events") -> sql.DataFrame: df = self.query( f""" SELECT timestamp_obj as other_timestamp, * diff --git a/jobs/examples/ex3_incremental_prep_job.py b/jobs/examples/ex3_incremental_prep_job.py index 92d4504f..7bdb3696 100644 --- a/jobs/examples/ex3_incremental_prep_job.py +++ b/jobs/examples/ex3_incremental_prep_job.py @@ -9,23 +9,13 @@ def transform(self, some_events): udf_format_datetime = udf(self.format_datetime, StringType()) - events_cleaned = some_events \ - .withColumn('timestamp_obj', udf_format_datetime(some_events.timestamp).cast("timestamp")) \ - .where(col('timestamp').like("%2.016%") == False) + events_cleaned = some_events.withColumn( + "timestamp_obj", + udf_format_datetime(some_events.timestamp).cast("timestamp"), + ).where(col("timestamp").like("%2.016%") == False) return events_cleaned - @staticmethod - def format_datetime(wiki_dt): - dt = {} - dt['year'] = wiki_dt[:4] - dt['month'] = wiki_dt[4:6] - dt['day'] = wiki_dt[6:8] - dt['hour'] = wiki_dt[8:10] - dt['minute'] = wiki_dt[10:12] - dt['sec'] = wiki_dt[12:14] - return '{year}-{month}-{day} {hour}:{minute}:{sec}'.format(**dt) - if __name__ == "__main__": - args = {'job_param_file': 'conf/jobs_metadata.yml'} + args = {"job_param_file": "conf/jobs_metadata.yml"} Commandliner(Job, **args) diff --git a/jobs/examples/ex4_dependency1_job.py b/jobs/examples/ex4_dependency1_job.py index a96b50ce..637991da 100644 --- a/jobs/examples/ex4_dependency1_job.py +++ b/jobs/examples/ex4_dependency1_job.py @@ -1,18 +1,17 @@ from yaetos.etl_utils import ETL_Base, Commandliner -from pyspark.sql.functions import udf, array -from pyspark.sql.types import StringType, IntegerType -from pyspark.sql.functions import col +from pyspark import sql class Job(ETL_Base): - def transform(self, some_events): - df = self.query(""" + def transform(self, some_events: str = "some_events") -> sql.DataFrame: + return 
self.query( + f""" SELECT se.session_id, length(se.session_id) as session_length - FROM some_events se - """) - return df + FROM {some_events} se + """ + ) if __name__ == "__main__": - args = {'job_param_file': 'conf/jobs_metadata.yml'} + args = {"job_param_file": "conf/jobs_metadata.yml"} Commandliner(Job, **args) diff --git a/yaetos/etl_utils.py b/yaetos/etl_utils.py index f5abf481..d0350226 100644 --- a/yaetos/etl_utils.py +++ b/yaetos/etl_utils.py @@ -31,48 +31,77 @@ from yaetos.git_utils import Git_Config_Manager from yaetos.env_dispatchers import FS_Ops_Dispatcher, Cred_Ops_Dispatcher from yaetos.logger import setup_logging -logger = setup_logging('Job') + +logger = setup_logging("Job") # imports should not include any native spark libs, to enable pandas only mode (running outside docker). # User settable params below can be changed from command line or yml or job inputs. -JOBS_METADATA_FILE = 'conf/jobs_metadata.yml' -AWS_CONFIG_FILE = 'conf/aws_config.cfg' -CONNECTION_FILE = 'conf/connections.cfg' -CLUSTER_APP_FOLDER = '/home/hadoop/app/' -LOCAL_APP_FOLDER = os.environ.get('PYSPARK_AWS_ETL_HOME', '') # PYSPARK_AWS_ETL_HOME set to end with '/', TODO: rename env var to YAETOS_HOME, and check if LOCAL_APP_FOLDER and LOCAL_JOB_REPO_FOLDER are used properly in code. Saw strange cases. -LOCAL_JOB_REPO_FOLDER = os.environ.get('PYSPARK_AWS_ETL_JOBS_HOME', '') -AWS_SECRET_ID = '/yaetos/connections' -JOB_FOLDER = 'jobs/' -PACKAGES_EMR = ['com.databricks:spark-redshift_2.11:2.0.1', 'org.apache.spark:spark-avro_2.11:2.4.0', 'mysql:mysql-connector-java:8.0.22', 'org.postgresql:postgresql:42.2.18'] # necessary for reading/writing to redshift, mysql & clickhouse using spark connector. -PACKAGES_EMR_ALT = ['io.github.spark-redshift-community:spark-redshift_2.12:5.0.3', 'org.apache.spark:spark-avro_2.12:3.1.1', 'mysql:mysql-connector-java:8.0.22', 'org.postgresql:postgresql:42.2.18'] # same but compatible with spark 3. -PACKAGES_LOCAL = PACKAGES_EMR + ['com.amazonaws:aws-java-sdk-pom:1.11.760', 'org.apache.hadoop:hadoop-aws:2.7.0'] -PACKAGES_LOCAL_ALT = PACKAGES_EMR_ALT + ['com.amazonaws:aws-java-sdk-pom:1.11.760', 'org.apache.hadoop:hadoop-aws:2.7.0'] # will probably need to be moved to hadoop-aws:3.2.1 to work locally. -JARS = 'https://s3.amazonaws.com/redshift-downloads/drivers/jdbc/1.2.41.1065/RedshiftJDBC42-no-awssdk-1.2.41.1065.jar' # not available in public repo so cannot be put in "packages" var. +JOBS_METADATA_FILE = "conf/jobs_metadata.yml" +AWS_CONFIG_FILE = "conf/aws_config.cfg" +CONNECTION_FILE = "conf/connections.cfg" +CLUSTER_APP_FOLDER = "/home/hadoop/app/" +LOCAL_APP_FOLDER = os.environ.get( + "PYSPARK_AWS_ETL_HOME", "" +) # PYSPARK_AWS_ETL_HOME set to end with '/', TODO: rename env var to YAETOS_HOME, and check if LOCAL_APP_FOLDER and LOCAL_JOB_REPO_FOLDER are used properly in code. Saw strange cases. +LOCAL_JOB_REPO_FOLDER = os.environ.get("PYSPARK_AWS_ETL_JOBS_HOME", "") +AWS_SECRET_ID = "/yaetos/connections" +JOB_FOLDER = "jobs/" +PACKAGES_EMR = [ + "com.databricks:spark-redshift_2.11:2.0.1", + "org.apache.spark:spark-avro_2.11:2.4.0", + "mysql:mysql-connector-java:8.0.22", + "org.postgresql:postgresql:42.2.18", +] # necessary for reading/writing to redshift, mysql & clickhouse using spark connector. +PACKAGES_EMR_ALT = [ + "io.github.spark-redshift-community:spark-redshift_2.12:5.0.3", + "org.apache.spark:spark-avro_2.12:3.1.1", + "mysql:mysql-connector-java:8.0.22", + "org.postgresql:postgresql:42.2.18", +] # same but compatible with spark 3. 
+PACKAGES_LOCAL = PACKAGES_EMR + [ + "com.amazonaws:aws-java-sdk-pom:1.11.760", + "org.apache.hadoop:hadoop-aws:2.7.0", +] +PACKAGES_LOCAL_ALT = PACKAGES_EMR_ALT + [ + "com.amazonaws:aws-java-sdk-pom:1.11.760", + "org.apache.hadoop:hadoop-aws:2.7.0", +] # will probably need to be moved to hadoop-aws:3.2.1 to work locally. +JARS = "https://s3.amazonaws.com/redshift-downloads/drivers/jdbc/1.2.41.1065/RedshiftJDBC42-no-awssdk-1.2.41.1065.jar" # not available in public repo so cannot be put in "packages" var. class ETL_Base(object): - TABULAR_TYPES = ('csv', 'parquet', 'df', 'mysql', 'clickhouse') - SPARK_DF_TYPES = ('csv', 'parquet', 'df', 'mysql', 'clickhouse') - PANDAS_DF_TYPES = ('csv', 'parquet', 'df') - FILE_TYPES = ('csv', 'parquet', 'txt') - OTHER_TYPES = ('other', 'None') - SUPPORTED_TYPES = set(TABULAR_TYPES) \ - .union(set(SPARK_DF_TYPES)) \ - .union(set(PANDAS_DF_TYPES)) \ - .union(set(FILE_TYPES)) \ + TABULAR_TYPES = ("csv", "parquet", "df", "mysql", "clickhouse") + SPARK_DF_TYPES = ("csv", "parquet", "df", "mysql", "clickhouse") + PANDAS_DF_TYPES = ("csv", "parquet", "df") + FILE_TYPES = ("csv", "parquet", "txt") + OTHER_TYPES = ("other", "None") + SUPPORTED_TYPES = ( + set(TABULAR_TYPES) + .union(set(SPARK_DF_TYPES)) + .union(set(PANDAS_DF_TYPES)) + .union(set(FILE_TYPES)) .union(set(OTHER_TYPES)) + ) def __init__(self, pre_jargs={}, jargs=None, loaded_inputs={}): self.loaded_inputs = loaded_inputs self.jargs = self.set_jargs(pre_jargs, loaded_inputs) if not jargs else jargs if self.jargs.manage_git_info: - git_yml = Git_Config_Manager().get_config(mode=self.jargs.mode, local_app_folder=LOCAL_APP_FOLDER, cluster_app_folder=CLUSTER_APP_FOLDER) - [git_yml.pop(key, None) for key in ('diffs_current', 'diffs_yaetos') if git_yml] - logger.info('Git info {}'.format(git_yml)) + git_yml = Git_Config_Manager().get_config( + mode=self.jargs.mode, + local_app_folder=LOCAL_APP_FOLDER, + cluster_app_folder=CLUSTER_APP_FOLDER, + ) + [ + git_yml.pop(key, None) + for key in ("diffs_current", "diffs_yaetos") + if git_yml + ] + logger.info("Git info {}".format(git_yml)) def etl(self, sc, sc_sql): - """ Main function. If incremental, reruns ETL process multiple time until + """Main function. If incremental, reruns ETL process multiple time until fully loaded, otherwise, just run ETL once. It's a way to deal with case where full incremental rerun from scratch would require a larger cluster to build in 1 shot than the typical incremental. @@ -83,7 +112,7 @@ def etl(self, sc, sc_sql): else: output = self.etl_multi_pass(sc, sc_sql, self.loaded_inputs) except Exception as err: - if self.jargs.mode in ('prod_EMR') and self.jargs.merged_args.get('owners'): + if self.jargs.mode in ("prod_EMR") and self.jargs.merged_args.get("owners"): self.send_job_failure_email(err) raise Exception("Job failed, error: \n{}".format(err)) return output @@ -91,98 +120,144 @@ def etl(self, sc, sc_sql): def etl_multi_pass(self, sc, sc_sql, loaded_inputs={}): needs_run = True ii = 0 - while needs_run: # TODO: check to rewrite as for loop. Simpler and avoiding potential infinite loops. + while ( + needs_run + ): # TODO: check to rewrite as for loop. Simpler and avoiding potential infinite loops. # TODO: isolate code below into separate functions. 
- ii+=1 - if self.jargs.merged_args.get('job_increment') == 'daily': + ii += 1 + if self.jargs.merged_args.get("job_increment") == "daily": if ii == 1: - first_day = self.jargs.merged_args['first_day'] + first_day = self.jargs.merged_args["first_day"] last_run_period = self.get_last_run_period_daily(sc, sc_sql) - periods = Period_Builder().get_last_output_to_last_day(last_run_period, first_day) + periods = Period_Builder().get_last_output_to_last_day( + last_run_period, first_day + ) if len(periods) == 0: - logger.info('Output up to date. Nothing to run. last processed period={} and last period from now={}'.format(last_run_period, Period_Builder.get_last_day())) + logger.info( + "Output up to date. Nothing to run. last processed period={} and last period from now={}".format( + last_run_period, Period_Builder.get_last_day() + ) + ) output = su.create_empty_sdf(sc_sql) - self.final_inc = True # remove "self." when sandbox job doesn't depend on it. + self.final_inc = ( + True # remove "self." when sandbox job doesn't depend on it. + ) else: - logger.info('Periods remaining to load: {}'.format(periods)) + logger.info("Periods remaining to load: {}".format(periods)) period = periods[0] - logger.info('Period to be loaded in this run: {}'.format(period)) + logger.info("Period to be loaded in this run: {}".format(period)) self.period = period # to be captured in etl_one_pass, needed for in database filtering. - self.period_next = periods[1] if len(periods)>=2 else None # same - self.jargs.merged_args['file_tag'] = period + self.period_next = periods[1] if len(periods) >= 2 else None # same + self.jargs.merged_args["file_tag"] = period output = self.etl_one_pass(sc, sc_sql, loaded_inputs) self.final_inc = period == periods[-1] periods.pop(0) # for next increment. else: raise Exception("'job_increment' param has to be set to 'daily'") - if self.jargs.rerun_criteria == 'last_date': # i.e. stop when reached final increment, i.e. current period is last to process. Pb: can go in infinite loop if missing data. + if ( + self.jargs.rerun_criteria == "last_date" + ): # i.e. stop when reached final increment, i.e. current period is last to process. Pb: can go in infinite loop if missing data. needs_run = not self.final_inc - elif self.jargs.rerun_criteria == 'output_empty': # i.e. stop when current inc is empty. Good to deal with late arriving data, but will be a pb if some increment doesn't have data and will never have. + elif ( + self.jargs.rerun_criteria == "output_empty" + ): # i.e. stop when current inc is empty. Good to deal with late arriving data, but will be a pb if some increment doesn't have data and will never have. 
needs_run = not self.output_empty - elif self.jargs.rerun_criteria == 'both': + elif self.jargs.rerun_criteria == "both": needs_run = not (self.output_empty or self.final_inc) if needs_run: - del(output) + del output gc.collect() - logger.info('Incremental build needs other run -> {}'.format(needs_run)) + logger.info("Incremental build needs other run -> {}".format(needs_run)) # TODO: check to change output to reload all outputs from inc build return output def etl_one_pass(self, sc, sc_sql, loaded_inputs={}): - """ Main etl function, loads inputs, runs transform, and saves output.""" - logger.info("-------Starting running job '{}'--------".format(self.jargs.job_name)) + """Main etl function, loads inputs, runs transform, and saves output.""" + logger.info( + "-------Starting running job '{}'--------".format(self.jargs.job_name) + ) start_time = time() - self.start_dt = datetime.utcnow() # attached to self so available within "transform()" func. + self.start_dt = ( + datetime.utcnow() + ) # attached to self so available within "transform()" func. output, schemas = self.etl_no_io(sc, sc_sql, loaded_inputs) if output is None: if self.jargs.is_incremental: - logger.info("-------End job '{}', increment with empty output--------".format(self.jargs.job_name)) + logger.info( + "-------End job '{}', increment with empty output--------".format( + self.jargs.job_name + ) + ) self.output_empty = True else: - logger.info("-------End job '{}', no output--------".format(self.jargs.job_name)) + logger.info( + "-------End job '{}', no output--------".format(self.jargs.job_name) + ) # TODO: add process time in that case. return None - if not self.jargs.no_fw_cache or (self.jargs.is_incremental and self.jargs.rerun_criteria == 'output_empty'): - logger.info('Output sample:') + if not self.jargs.no_fw_cache or ( + self.jargs.is_incremental and self.jargs.rerun_criteria == "output_empty" + ): + logger.info("Output sample:") try: output.show() except Exception as e: - logger.info("Warning: Failed showing table sample with error '{}'.".format(e)) + logger.info( + "Warning: Failed showing table sample with error '{}'.".format(e) + ) pass count = output.count() - logger.info('Output count: {}'.format(count)) - if self.jargs.output.get('df_type', 'spark')=='spark': - logger.info("Output data types: {}".format(pformat([(fd.name, fd.dataType) for fd in output.schema.fields]))) + logger.info("Output count: {}".format(count)) + if self.jargs.output.get("df_type", "spark") == "spark": + logger.info( + "Output data types: {}".format( + pformat([(fd.name, fd.dataType) for fd in output.schema.fields]) + ) + ) self.output_empty = count == 0 self.save_output(output, self.start_dt) end_time = time() elapsed = end_time - start_time - logger.info('Process time to complete (post save to file but pre copy to db if any, also may not include processing if output not saved): {} s'.format(elapsed)) + logger.info( + "Process time to complete (post save to file but pre copy to db if any, also may not include processing if output not saved): {} s".format( + elapsed + ) + ) if self.jargs.save_schemas and schemas: schemas.save_yaml(self.jargs.job_name) # self.save_metadata(elapsed) # disable for now to avoid spark parquet reading issues. TODO: check to re-enable. 
- if self.jargs.merged_args.get('copy_to_redshift') and self.jargs.enable_redshift_push: - self.copy_to_redshift_using_spark(output) # to use pandas: self.copy_to_redshift_using_pandas(output, self.OUTPUT_TYPES) - if self.jargs.merged_args.get('copy_to_clickhouse') and self.jargs.enable_redshift_push: # TODO: rename enable_redshift_push to enable_db_push since not redshift here. + if ( + self.jargs.merged_args.get("copy_to_redshift") + and self.jargs.enable_redshift_push + ): + self.copy_to_redshift_using_spark( + output + ) # to use pandas: self.copy_to_redshift_using_pandas(output, self.OUTPUT_TYPES) + if ( + self.jargs.merged_args.get("copy_to_clickhouse") + and self.jargs.enable_redshift_push + ): # TODO: rename enable_redshift_push to enable_db_push since not redshift here. self.copy_to_clickhouse(output) - if self.jargs.merged_args.get('copy_to_kafka'): + if self.jargs.merged_args.get("copy_to_kafka"): self.push_to_kafka(output, self.OUTPUT_TYPES) - if self.jargs.output.get('df_type', 'spark')=='spark': + if self.jargs.output.get("df_type", "spark") == "spark": output.unpersist() end_time = time() elapsed = end_time - start_time - logger.info('Process time to complete job (post db copies if any): {} s'.format(elapsed)) + logger.info( + "Process time to complete job (post db copies if any): {} s".format(elapsed) + ) logger.info("-------End job '{}'--------".format(self.jargs.job_name)) return output def etl_no_io(self, sc, sc_sql, loaded_inputs={}, jargs=None): - """ Function to load inputs (including from live vars) and run transform. No output to disk. + """Function to load inputs (including from live vars) and run transform. No output to disk. Having this code isolated is useful for cases with no I/O possible, like testing.""" self.jargs = jargs or self.jargs self.sc = sc @@ -194,8 +269,12 @@ def etl_no_io(self, sc, sc_sql, loaded_inputs={}, jargs=None): loaded_datasets = self.load_inputs(loaded_inputs) output = self.transform(**loaded_datasets) - if output is not None and self.jargs.output['type'] in self.TABULAR_TYPES and self.jargs.output.get('df_type', 'spark')=='spark': - if self.jargs.add_created_at=='true': + if ( + output is not None + and self.jargs.output["type"] in self.TABULAR_TYPES + and self.jargs.output.get("df_type", "spark") == "spark" + ): + if self.jargs.add_created_at == "true": output = su.add_created_at(output, self.start_dt) output.cache() schemas = Schema_Builder() @@ -204,8 +283,10 @@ def etl_no_io(self, sc, sc_sql, loaded_inputs={}, jargs=None): schemas = None return output, schemas - def etl_no_io_function(self, sc, sc_sql, transform=None, transform_args={}, loaded_inputs={}): - """ Used for testing internal functions""" + def etl_no_io_function( + self, sc, sc_sql, transform=None, transform_args={}, loaded_inputs={} + ): + """Used for testing internal functions""" # self.jargs = jargs self.sc = sc self.sc_sql = sc_sql @@ -216,22 +297,35 @@ def etl_no_io_function(self, sc, sc_sql, transform=None, transform_args={}, load return output def transform(self, **app_args): - """ The function that needs to be overriden by each specific job.""" + """The function that needs to be overriden by each specific job.""" raise NotImplementedError def get_last_run_period_daily(self, sc, sc_sql): - previous_output_max_timestamp = self.get_previous_output_max_timestamp(sc, sc_sql) - last_run_period = previous_output_max_timestamp.strftime("%Y-%m-%d") if previous_output_max_timestamp else None # TODO: if get_output_max_timestamp()=None, means new build, so should delete 
instance in DBs. + previous_output_max_timestamp = self.get_previous_output_max_timestamp( + sc, sc_sql + ) + last_run_period = ( + previous_output_max_timestamp.strftime("%Y-%m-%d") + if previous_output_max_timestamp + else None + ) # TODO: if get_output_max_timestamp()=None, means new build, so should delete instance in DBs. return last_run_period def set_jargs(self, pre_jargs, loaded_inputs={}): - """ jargs means job args. Function called only if running the job directly, i.e. "python some_job.py""" + """jargs means job args. Function called only if running the job directly, i.e. "python some_job.py""" py_job = self.set_py_job() job_name = Job_Yml_Parser.set_job_name_from_file(py_job) - return Job_Args_Parser(defaults_args=pre_jargs['defaults_args'], yml_args=None, job_args=pre_jargs['job_args'], cmd_args=pre_jargs['cmd_args'], job_name=job_name, loaded_inputs=loaded_inputs) # set yml_args=None so loading yml is handled in Job_Args_Parser() + return Job_Args_Parser( + defaults_args=pre_jargs["defaults_args"], + yml_args=None, + job_args=pre_jargs["job_args"], + cmd_args=pre_jargs["cmd_args"], + job_name=job_name, + loaded_inputs=loaded_inputs, + ) # set yml_args=None so loading yml is handled in Job_Args_Parser() def set_py_job(self): - """ Returns the file being executed. For ex, when running "python some_job.py", this functions returns "some_job.py". + """Returns the file being executed. For ex, when running "python some_job.py", this functions returns "some_job.py". Only gives good output when the job is launched that way.""" py_job = inspect.getsourcefile(self.__class__) logger.info("py_job: '{}'".format(py_job)) @@ -244,21 +338,28 @@ def load_inputs(self, loaded_inputs): # Load from memory if available if item in loaded_inputs.keys(): app_args[item] = loaded_inputs[item] - logger.info("Input '{}' passed in memory from a previous job.".format(item)) + logger.info( + "Input '{}' passed in memory from a previous job.".format(item) + ) continue # Skip "other" types - if self.jargs.inputs[item]['type'] == "other": + if self.jargs.inputs[item]["type"] == "other": app_args[item] = None - logger.info("Input '{}' not loaded since type set to 'other'.".format(item)) + logger.info( + "Input '{}' not loaded since type set to 'other'.".format(item) + ) continue # Load from disk app_args[item] = self.load_input(item) logger.info("Input '{}' loaded.".format(item)) - if self.jargs.is_incremental and self.jargs.inputs[item]['type'] not in ('mysql', 'clickouse'): - if self.jargs.merged_args.get('motm_incremental'): + if self.jargs.is_incremental and self.jargs.inputs[item]["type"] not in ( + "mysql", + "clickouse", + ): + if self.jargs.merged_args.get("motm_incremental"): app_args = self.filter_incremental_inputs_motm(app_args) else: app_args = self.filter_incremental_inputs_period(app_args) @@ -270,104 +371,170 @@ def filter_incremental_inputs_motm(self, app_args): """Filter based on Min Of The Max (motm) of all inputs. Good to deal with late arriving data or async load but gets stuck if 1 input never has any new data arriving. 
Assumes increment fields are datetime.""" - min_dt = self.get_previous_output_max_timestamp(self.sc, self.sc_sql) if len(app_args.keys()) > 0 else None + min_dt = ( + self.get_previous_output_max_timestamp(self.sc, self.sc_sql) + if len(app_args.keys()) > 0 + else None + ) # Get latest timestamp in common across incremental inputs maxes = [] for item in app_args.keys(): - input_is_tabular = self.jargs.inputs[item]['type'] in self.TABULAR_TYPES and self.jargs.inputs[item]('df_type', 'spark')=='spark' - inc = self.jargs.inputs[item].get('inc_field', None) + input_is_tabular = ( + self.jargs.inputs[item]["type"] in self.TABULAR_TYPES + and self.jargs.inputs[item]("df_type", "spark") == "spark" + ) + inc = self.jargs.inputs[item].get("inc_field", None) if input_is_tabular and inc: max_dt = app_args[item].agg({inc: "max"}).collect()[0][0] maxes.append(max_dt) - max_dt = min(maxes) if len(maxes)>0 else None + max_dt = min(maxes) if len(maxes) > 0 else None # Filter for item in app_args.keys(): - input_is_tabular = self.jargs.inputs[item]['type'] in self.TABULAR_TYPES and self.jargs.inputs[item]('df_type', 'spark')=='spark' - inc = self.jargs.inputs[item].get('inc_field', None) + input_is_tabular = ( + self.jargs.inputs[item]["type"] in self.TABULAR_TYPES + and self.jargs.inputs[item]("df_type", "spark") == "spark" + ) + inc = self.jargs.inputs[item].get("inc_field", None) if inc: if input_is_tabular: # TODO: add limit to amount of input data, and set self.final_inc=False - inc_type = {k:v for k, v in app_args[item].dtypes}[inc] - logger.info("Input dataset '{}' will be filtered for min_dt={} max_dt={}".format(item, min_dt, max_dt)) + inc_type = {k: v for k, v in app_args[item].dtypes}[inc] + logger.info( + "Input dataset '{}' will be filtered for min_dt={} max_dt={}".format( + item, min_dt, max_dt + ) + ) if min_dt: # min_dt = to_date(lit(s)).cast(TimestampType() # TODO: deal with dt type, as coming from parquet - app_args[item] = app_args[item].filter(app_args[item][inc] > min_dt) + app_args[item] = app_args[item].filter( + app_args[item][inc] > min_dt + ) if max_dt: - app_args[item] = app_args[item].filter(app_args[item][inc] <= max_dt) + app_args[item] = app_args[item].filter( + app_args[item][inc] <= max_dt + ) else: - raise Exception("Incremental loading is not supported for unstructured input. You need to handle the incremental logic in the job code.") + raise Exception( + "Incremental loading is not supported for unstructured input. You need to handle the incremental logic in the job code." + ) return app_args def filter_incremental_inputs_period(self, app_args): """Filter based on period defined in. Simple but can be a pb if late arriving data or dependencies not run. 
Inputs filtered inside source database will be filtered again.""" for item in app_args.keys(): - input_is_tabular = self.jargs.inputs[item]['type'] in self.TABULAR_TYPES and self.jargs.inputs[item]('df_type', 'spark')=='spark' - inc = self.jargs.inputs[item].get('inc_field', None) + input_is_tabular = ( + self.jargs.inputs[item]["type"] in self.TABULAR_TYPES + and self.jargs.inputs[item]("df_type", "spark") == "spark" + ) + inc = self.jargs.inputs[item].get("inc_field", None) if inc: if input_is_tabular: # TODO: add limit to amount of input data, and set self.final_inc=False - inc_type = {k:v for k, v in app_args[item].dtypes}[inc] - logger.info("Input dataset '{}' will be filtered for {}='{}'".format(item, inc, self.period)) - app_args[item] = app_args[item].filter(app_args[item][inc] == self.period) + inc_type = {k: v for k, v in app_args[item].dtypes}[inc] + logger.info( + "Input dataset '{}' will be filtered for {}='{}'".format( + item, inc, self.period + ) + ) + app_args[item] = app_args[item].filter( + app_args[item][inc] == self.period + ) else: - raise Exception("Incremental loading is not supported for unstructured input. You need to handle the incremental logic in the job code.") + raise Exception( + "Incremental loading is not supported for unstructured input. You need to handle the incremental logic in the job code." + ) return app_args def sql_register(self, app_args): for item in app_args.keys(): - input_is_tabular = hasattr(app_args[item], "rdd") # assuming DataFrame will keep 'rdd' attribute + input_is_tabular = hasattr( + app_args[item], "rdd" + ) # assuming DataFrame will keep 'rdd' attribute # ^ better than using self.jargs.inputs[item]['type'] in self.TABULAR_TYPES since doesn't require 'type' being defined. if input_is_tabular: app_args[item].createOrReplaceTempView(item) def load_input(self, input_name): - input_type = self.jargs.inputs[input_name]['type'] + input_type = self.jargs.inputs[input_name]["type"] if input_type in self.FILE_TYPES: - path = self.jargs.inputs[input_name]['path'] - path = path.replace('s3://', 's3a://') if self.jargs.mode == 'dev_local' else path - logger.info("Input '{}' to be loaded from files '{}'.".format(input_name, path)) - path = Path_Handler(path, self.jargs.base_path).expand_later(self.jargs.storage) + path = self.jargs.inputs[input_name]["path"] + path = ( + path.replace("s3://", "s3a://") + if self.jargs.mode == "dev_local" + else path + ) + logger.info( + "Input '{}' to be loaded from files '{}'.".format(input_name, path) + ) + path = Path_Handler(path, self.jargs.base_path).expand_later( + self.jargs.storage + ) # Unstructured type - if input_type == 'txt': + if input_type == "txt": rdd = self.sc.textFile(path) logger.info("Input '{}' loaded from files '{}'.".format(input_name, path)) return rdd # Tabular, Pandas - if self.jargs.inputs[input_name].get('df_type') == 'pandas': - if input_type == 'csv': - pdf = FS_Ops_Dispatcher().load_pandas(path, file_type='csv', read_func='read_csv', read_kwargs=self.jargs.inputs[input_name].get('read_kwargs',{})) - elif input_type == 'parquet': - pdf = FS_Ops_Dispatcher().load_pandas(path, file_type='parquet', read_func='read_parquet', read_kwargs=self.jargs.inputs[input_name].get('read_kwargs',{})) + if self.jargs.inputs[input_name].get("df_type") == "pandas": + if input_type == "csv": + pdf = FS_Ops_Dispatcher().load_pandas( + path, + file_type="csv", + read_func="read_csv", + read_kwargs=self.jargs.inputs[input_name].get("read_kwargs", {}), + ) + elif input_type == "parquet": + pdf = 
FS_Ops_Dispatcher().load_pandas( + path, + file_type="parquet", + read_func="read_parquet", + read_kwargs=self.jargs.inputs[input_name].get("read_kwargs", {}), + ) else: - raise Exception("Unsupported input type '{}' for path '{}'. Supported types for pandas are: {}. ".format(input_type, self.jargs.inputs[input_name].get('path'), self.PANDAS_DF_TYPES)) + raise Exception( + "Unsupported input type '{}' for path '{}'. Supported types for pandas are: {}. ".format( + input_type, + self.jargs.inputs[input_name].get("path"), + self.PANDAS_DF_TYPES, + ) + ) logger.info("Input '{}' loaded from files '{}'.".format(input_name, path)) # logger.info("Input data types: {}".format(pformat([(fd.name, fd.dataType) for fd in sdf.schema.fields]))) # TODO adapt to pandas return pdf - # Tabular types, Spark - if input_type == 'csv': - delimiter = self.jargs.merged_args.get('csv_delimiter', ',') + if input_type == "csv": + delimiter = self.jargs.merged_args.get("csv_delimiter", ",") sdf = self.sc_sql.read.option("delimiter", delimiter).csv(path, header=True) logger.info("Input '{}' loaded from files '{}'.".format(input_name, path)) - elif input_type == 'parquet': + elif input_type == "parquet": sdf = self.sc_sql.read.parquet(path) logger.info("Input '{}' loaded from files '{}'.".format(input_name, path)) - elif input_type == 'mysql': + elif input_type == "mysql": sdf = self.load_mysql(input_name) logger.info("Input '{}' loaded from mysql".format(input_name)) - elif input_type == 'clickhouse': + elif input_type == "clickhouse": sdf = self.load_clickhouse(input_name) logger.info("Input '{}' loaded from clickhouse".format(input_name)) else: - raise Exception("Unsupported input type '{}' for path '{}'. Supported types are: {}. ".format(input_type, self.jargs.inputs[input_name].get('path'), self.SUPPORTED_TYPES)) - - logger.info("Input data types: {}".format(pformat([(fd.name, fd.dataType) for fd in sdf.schema.fields]))) + raise Exception( + "Unsupported input type '{}' for path '{}'. Supported types are: {}. ".format( + input_type, + self.jargs.inputs[input_name].get("path"), + self.SUPPORTED_TYPES, + ) + ) + + logger.info( + "Input data types: {}".format( + pformat([(fd.name, fd.dataType) for fd in sdf.schema.fields]) + ) + ) return sdf def load_data_from_files(self, name, path, type, sc, sc_sql): @@ -376,25 +543,35 @@ def load_data_from_files(self, name, path, type, sc, sc_sql): # TODO: integrate with load_input to remove duplicated code. input_type = type input_name = name - path = path.replace('s3://', 's3a://') if self.jargs.mode == 'dev_local' else path - logger.info("Dataset '{}' to be loaded from files '{}'.".format(input_name, path)) + path = ( + path.replace("s3://", "s3a://") if self.jargs.mode == "dev_local" else path + ) + logger.info( + "Dataset '{}' to be loaded from files '{}'.".format(input_name, path) + ) path = Path_Handler(path, self.jargs.base_path).expand_later(self.jargs.storage) - if input_type == 'txt': + if input_type == "txt": rdd = self.sc.textFile(path) logger.info("Dataset '{}' loaded from files '{}'.".format(input_name, path)) return rdd # Tabular types - if input_type == 'csv': - sdf = sc_sql.read.csv(path, header=True) # TODO: add way to add .option("delimiter", ';'), useful for metric_budgeting. + if input_type == "csv": + sdf = sc_sql.read.csv( + path, header=True + ) # TODO: add way to add .option("delimiter", ';'), useful for metric_budgeting. 
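The TODO just above (a configurable delimiter for load_data_from_files) is essentially the csv_delimiter pattern that load_input already uses earlier in this file; a minimal sketch of the same approach, assuming self.jargs is reachable here as it is elsewhere in this method:

# Sketch only: reuse the csv_delimiter job argument instead of hardcoding ','.
delimiter = self.jargs.merged_args.get("csv_delimiter", ",")
sdf = sc_sql.read.option("delimiter", delimiter).csv(path, header=True)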
logger.info("Dataset '{}' loaded from files '{}'.".format(input_name, path)) - elif input_type == 'parquet': + elif input_type == "parquet": # TODO: check to add ...read.option("mergeSchema", "true").parquet... sdf = sc_sql.read.parquet(path) logger.info("Dataset '{}' loaded from files '{}'.".format(input_name, path)) else: - raise Exception("Unsupported dataset type '{}' for path '{}'. Supported types are: {}. ".format(input_type, path, self.SUPPORTED_TYPES)) + raise Exception( + "Unsupported dataset type '{}' for path '{}'. Supported types are: {}. ".format( + input_type, path, self.SUPPORTED_TYPES + ) + ) # New param "custom_schema" to work for both db and file inputs (instead of just db). TODO: finish. # df_custom_schema = self.jargs.merged_args.get('df_custom_schema') @@ -402,92 +579,144 @@ def load_data_from_files(self, name, path, type, sc, sc_sql): # for field, type in df_custom_schema.items(): # table_to_copy = table_to_copy.withColumn(field, table_to_copy[field].cast(type)) - logger.info("Dataset data types: {}".format(pformat([(fd.name, fd.dataType) for fd in sdf.schema.fields]))) + logger.info( + "Dataset data types: {}".format( + pformat([(fd.name, fd.dataType) for fd in sdf.schema.fields]) + ) + ) return sdf def load_mysql(self, input_name): - creds = Cred_Ops_Dispatcher().retrieve_secrets(self.jargs.storage, creds=self.jargs.connection_file) - creds_section = self.jargs.inputs[input_name]['creds'] + creds = Cred_Ops_Dispatcher().retrieve_secrets( + self.jargs.storage, creds=self.jargs.connection_file + ) + creds_section = self.jargs.inputs[input_name]["creds"] db = creds[creds_section] - extra_params = '' # can use '?zeroDateTimeBehavior=CONVERT_TO_NULL' to help solve "java.sql.SQLException: Zero date value prohibited" but leads to other error msg. - url = 'jdbc:mysql://{host}:{port}/{service}{extra_params}'.format(host=db['host'], port=db['port'], service=db['service'], extra_params=extra_params) - dbtable = self.jargs.inputs[input_name]['db_table'] - inc_field = self.jargs.inputs[input_name].get('inc_field') + extra_params = "" # can use '?zeroDateTimeBehavior=CONVERT_TO_NULL' to help solve "java.sql.SQLException: Zero date value prohibited" but leads to other error msg. 
+ url = "jdbc:mysql://{host}:{port}/{service}{extra_params}".format( + host=db["host"], + port=db["port"], + service=db["service"], + extra_params=extra_params, + ) + dbtable = self.jargs.inputs[input_name]["db_table"] + inc_field = self.jargs.inputs[input_name].get("inc_field") if not inc_field: logger.info('Pulling table "{}" from mysql'.format(dbtable)) - sdf = self.sc_sql.read \ - .format('jdbc') \ - .option('driver', "com.mysql.cj.jdbc.Driver") \ - .option("url", url) \ - .option("user", db['user']) \ - .option("password", db['password']) \ - .option("dbtable", dbtable)\ + sdf = ( + self.sc_sql.read.format("jdbc") + .option("driver", "com.mysql.cj.jdbc.Driver") + .option("url", url) + .option("user", db["user"]) + .option("password", db["password"]) + .option("dbtable", dbtable) .load() + ) else: - inc_field = self.jargs.inputs[input_name]['inc_field'] + inc_field = self.jargs.inputs[input_name]["inc_field"] period = self.period # query_str = "select * from {} where {} = '{}'".format(dbtable, inc_field, period) - higher_limit = "AND {inc_field} < '{period_next}'".format(inc_field=inc_field, period_next=self.period_next) if self.period_next else '' - query_str = "select * from {dbtable} where {inc_field} >= '{period}' {higher_limit}".format(dbtable=dbtable, inc_field=inc_field, period=self.period, higher_limit=higher_limit) - logger.info('Pulling table from mysql with query_str "{}"'.format(query_str)) + higher_limit = ( + "AND {inc_field} < '{period_next}'".format( + inc_field=inc_field, period_next=self.period_next + ) + if self.period_next + else "" + ) + query_str = "select * from {dbtable} where {inc_field} >= '{period}' {higher_limit}".format( + dbtable=dbtable, + inc_field=inc_field, + period=self.period, + higher_limit=higher_limit, + ) + logger.info( + 'Pulling table from mysql with query_str "{}"'.format(query_str) + ) # if self.jargs.merged_args.get('custom_schema', '') # db_overridden_types_str = ', '.join([k + ' ' + v for k, v in db_overridden_types.items()]) # TODO: check if it should use com.mysql.cj.jdbc.Driver instead as above - sdf = self.sc_sql.read \ - .format('jdbc') \ - .option('driver', "com.mysql.jdbc.Driver") \ - .option('fetchsize', 10000) \ - .option('numPartitions', 3) \ - .option("url", url) \ - .option("user", db['user']) \ - .option("password", db['password']) \ - .option("customSchema", self.jargs.merged_args.get('jdbc_custom_schema', '')) \ - .option("query", query_str) \ + sdf = ( + self.sc_sql.read.format("jdbc") + .option("driver", "com.mysql.jdbc.Driver") + .option("fetchsize", 10000) + .option("numPartitions", 3) + .option("url", url) + .option("user", db["user"]) + .option("password", db["password"]) + .option( + "customSchema", self.jargs.merged_args.get("jdbc_custom_schema", "") + ) + .option("query", query_str) .load() + ) return sdf def load_clickhouse(self, input_name): - creds = Cred_Ops_Dispatcher().retrieve_secrets(self.jargs.storage, creds=self.jargs.connection_file) - creds_section = self.jargs.inputs[input_name]['creds'] + creds = Cred_Ops_Dispatcher().retrieve_secrets( + self.jargs.storage, creds=self.jargs.connection_file + ) + creds_section = self.jargs.inputs[input_name]["creds"] db = creds[creds_section] - url = 'jdbc:postgresql://{host}/{service}'.format(host=db['host'], service=db['service']) - dbtable = self.jargs.inputs[input_name]['db_table'] - inc_field = self.jargs.inputs[input_name].get('inc_field') + url = "jdbc:postgresql://{host}/{service}".format( + host=db["host"], service=db["service"] + ) + dbtable = 
self.jargs.inputs[input_name]["db_table"] + inc_field = self.jargs.inputs[input_name].get("inc_field") if not inc_field: logger.info('Pulling table "{}" from Clickhouse'.format(dbtable)) - sdf = self.sc_sql.read \ - .format('jdbc') \ - .option('driver', "org.postgresql.Driver") \ - .option("url", url) \ - .option("user", db['user']) \ - .option("password", db['password']) \ - .option("dbtable", dbtable)\ + sdf = ( + self.sc_sql.read.format("jdbc") + .option("driver", "org.postgresql.Driver") + .option("url", url) + .option("user", db["user"]) + .option("password", db["password"]) + .option("dbtable", dbtable) .load() + ) else: - inc_field = self.jargs.inputs[input_name]['inc_field'] + inc_field = self.jargs.inputs[input_name]["inc_field"] period = self.period - query_str = "select * from {} where {} = '{}'".format(dbtable, inc_field, period) - logger.info('Pulling table from Clickhouse with query_str "{}"'.format(query_str)) - sdf = self.sc_sql.read \ - .format('jdbc') \ - .option('driver', "org.postgresql.Driver") \ - .option('fetchsize', 10000) \ - .option('numPartitions', 3) \ - .option("url", url) \ - .option("user", db['user']) \ - .option("password", db['password']) \ - .option("query", query_str) \ + query_str = "select * from {} where {} = '{}'".format( + dbtable, inc_field, period + ) + logger.info( + 'Pulling table from Clickhouse with query_str "{}"'.format(query_str) + ) + sdf = ( + self.sc_sql.read.format("jdbc") + .option("driver", "org.postgresql.Driver") + .option("fetchsize", 10000) + .option("numPartitions", 3) + .option("url", url) + .option("user", db["user"]) + .option("password", db["password"]) + .option("query", query_str) .load() + ) return sdf def get_previous_output_max_timestamp(self, sc, sc_sql): - path = self.jargs.output['path'] # implies output path is incremental (no "{now}" in string.) - path += '*' if self.jargs.merged_args.get('incremental_type') == 'no_schema' else '' # '*' to go into output subfolders. + path = self.jargs.output[ + "path" + ] # implies output path is incremental (no "{now}" in string.) + path += ( + "*" if self.jargs.merged_args.get("incremental_type") == "no_schema" else "" + ) # '*' to go into output subfolders. try: - df = self.load_data_from_files(name='output', path=path, type=self.jargs.output['type'], sc=sc, sc_sql=sc_sql) + df = self.load_data_from_files( + name="output", + path=path, + type=self.jargs.output["type"], + sc=sc, + sc_sql=sc_sql, + ) except Exception as e: # TODO: don't catch all - logger.info("Previous increment could not be loaded or doesn't exist. It will be ignored. Folder '{}' failed loading with error '{}'.".format(path, e)) + logger.info( + "Previous increment could not be loaded or doesn't exist. It will be ignored. 
Folder '{}' failed loading with error '{}'.".format( + path, e + ) + ) return None dt = self.get_max_timestamp(df) @@ -495,62 +724,100 @@ def get_previous_output_max_timestamp(self, sc, sc_sql): return dt def get_max_timestamp(self, df): - return df.agg({self.jargs.output['inc_field']: "max"}).collect()[0][0] + return df.agg({self.jargs.output["inc_field"]: "max"}).collect()[0][0] def save_output(self, output, now_dt=None): - self.path = self.save(output=output, - path=self.jargs.output['path'], - base_path=self.jargs.base_path, - type=self.jargs.output['type'], - now_dt=now_dt, - is_incremental=self.jargs.is_incremental, - incremental_type=self.jargs.merged_args.get('incremental_type', 'no_schema'), - partitionby=self.jargs.output.get('inc_field') or self.jargs.merged_args.get('partitionby'), - file_tag=self.jargs.merged_args.get('file_tag')) # TODO: make param standard in cmd_args ? - - def save(self, output, path, base_path, type, now_dt=None, is_incremental=None, incremental_type=None, partitionby=None, file_tag=None): + self.path = self.save( + output=output, + path=self.jargs.output["path"], + base_path=self.jargs.base_path, + type=self.jargs.output["type"], + now_dt=now_dt, + is_incremental=self.jargs.is_incremental, + incremental_type=self.jargs.merged_args.get( + "incremental_type", "no_schema" + ), + partitionby=self.jargs.output.get("inc_field") + or self.jargs.merged_args.get("partitionby"), + file_tag=self.jargs.merged_args.get("file_tag"), + ) # TODO: make param standard in cmd_args ? + + def save( + self, + output, + path, + base_path, + type, + now_dt=None, + is_incremental=None, + incremental_type=None, + partitionby=None, + file_tag=None, + ): """Used to save output to disk. Can be used too inside jobs to output 2nd output for testing.""" path = Path_Handler(path, base_path).expand_now(now_dt) - if type == 'None': - logger.info('Did not write output to disk') + if type == "None": + logger.info("Did not write output to disk") return None - if is_incremental and incremental_type == 'no_schema': - current_time = now_dt.strftime('%Y%m%d_%H%M%S_utc') # no use of now_dt to make it updated for each inc. - file_tag = ('_' + file_tag) if file_tag else "" # TODO: make that param standard in cmd_args ? - path += 'inc_{}{}/'.format(current_time, file_tag) + if is_incremental and incremental_type == "no_schema": + current_time = now_dt.strftime( + "%Y%m%d_%H%M%S_utc" + ) # no use of now_dt to make it updated for each inc. + file_tag = ( + ("_" + file_tag) if file_tag else "" + ) # TODO: make that param standard in cmd_args ? 
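For reference, the incremental subfolder name produced by the file_tag expression above together with the path += line that follows, using illustrative values (timestamp, period and base path are hypothetical):

from datetime import datetime

now_dt = datetime(2022, 5, 28, 12, 24, 53)            # illustrative run time
current_time = now_dt.strftime("%Y%m%d_%H%M%S_utc")   # "20220528_122453_utc"
file_tag = "_" + "2022-05-28"                          # file_tag holds the period being loaded
path = "s3://some-bucket/output/"                      # hypothetical base output path
path += "inc_{}{}/".format(current_time, file_tag)
# -> "s3://some-bucket/output/inc_20220528_122453_utc_2022-05-28/"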
+ path += "inc_{}{}/".format(current_time, file_tag) # TODO: rename 'partitioned' to 'spark_partitions' and 'no_schema' to 'yaetos_partitions' - write_mode = 'append' if incremental_type == 'partitioned' or partitionby else 'error' - partitionby = partitionby.split(',') if partitionby else [] + write_mode = ( + "append" if incremental_type == "partitioned" or partitionby else "error" + ) + partitionby = partitionby.split(",") if partitionby else [] # Tabular, Pandas - if self.jargs.output.get('df_type') == 'pandas': - if type == 'csv': - FS_Ops_Dispatcher().save_pandas(output, path, save_method='to_csv', save_kwargs=self.jargs.output.get('save_kwargs',{})) - elif type == 'parquet': - FS_Ops_Dispatcher().save_pandas(output, path, save_method='to_parquet', save_kwargs=self.jargs.output.get('save_kwargs',{})) + if self.jargs.output.get("df_type") == "pandas": + if type == "csv": + FS_Ops_Dispatcher().save_pandas( + output, + path, + save_method="to_csv", + save_kwargs=self.jargs.output.get("save_kwargs", {}), + ) + elif type == "parquet": + FS_Ops_Dispatcher().save_pandas( + output, + path, + save_method="to_parquet", + save_kwargs=self.jargs.output.get("save_kwargs", {}), + ) else: - raise Exception("Need to specify supported output type for pandas, csv only for now.") - logger.info('Wrote output to ' + path) + raise Exception( + "Need to specify supported output type for pandas, csv only for now." + ) + logger.info("Wrote output to " + path) return path # TODO: deal with cases where "output" is df when expecting rdd, or at least raise issue in a cleaner way. - if type == 'txt': + if type == "txt": output.saveAsTextFile(path) - elif type == 'parquet': + elif type == "parquet": output.write.partitionBy(*partitionby).mode(write_mode).parquet(path) - elif type == 'csv': - output.write.partitionBy(*partitionby).mode(write_mode).option("header", "true").csv(path) + elif type == "csv": + output.write.partitionBy(*partitionby).mode(write_mode).option( + "header", "true" + ).csv(path) else: - raise Exception("Need to specify supported output type, either txt, parquet or csv.") + raise Exception( + "Need to specify supported output type, either txt, parquet or csv." + ) - logger.info('Wrote output to ' + path) + logger.info("Wrote output to " + path) return path def save_metadata(self, elapsed): - fname = self.path + '_metadata.txt' + fname = self.path + "_metadata.txt" content = """ -- app_name: %s -- job_name: %s @@ -560,12 +827,16 @@ def save_metadata(self, elapsed): -- output folder : TBD -- github hash: TBD -- code: TBD - """%(self.app_name, self.jargs.job_name, elapsed) + """ % ( + self.app_name, + self.jargs.job_name, + elapsed, + ) FS_Ops_Dispatcher().save_metadata(fname, content) def query(self, query_str): - logger.info('Query string:\n' + query_str) - df = self.sc_sql.sql(query_str) + logger.info("Query string:\n" + query_str) + df = self.sc_sql.sql(query_str) df.cache() return df @@ -573,77 +844,140 @@ def copy_to_redshift_using_pandas(self, output, types): # import put here below to avoid loading heavy libraries when not needed (optional feature). 
from yaetos.redshift_pandas import create_table from yaetos.db_utils import cast_col + df = output.toPandas() df = cast_col(df, types) - connection_profile = self.jargs.copy_to_redshift['creds'] - schema, name_tb = self.jargs.copy_to_redshift['table'].split('.') - schema = schema.format(schema=self.jargs.schema) if '{schema}' in schema else schema - creds = Cred_Ops_Dispatcher().retrieve_secrets(self.jargs.storage, creds=self.jargs.connection_file) - create_table(df, connection_profile, name_tb, schema, types, creds, self.jargs.is_incremental) - del(df) + connection_profile = self.jargs.copy_to_redshift["creds"] + schema, name_tb = self.jargs.copy_to_redshift["table"].split(".") + schema = ( + schema.format(schema=self.jargs.schema) if "{schema}" in schema else schema + ) + creds = Cred_Ops_Dispatcher().retrieve_secrets( + self.jargs.storage, creds=self.jargs.connection_file + ) + create_table( + df, + connection_profile, + name_tb, + schema, + types, + creds, + self.jargs.is_incremental, + ) + del df def copy_to_redshift_using_spark(self, sdf): # import put here below to avoid loading heavy libraries when not needed (optional feature). from yaetos.redshift_spark import create_table - connection_profile = self.jargs.copy_to_redshift['creds'] - schema, name_tb= self.jargs.copy_to_redshift['table'].split('.') - schema = schema.format(schema=self.jargs.schema) if '{schema}' in schema else schema - creds = Cred_Ops_Dispatcher().retrieve_secrets(self.jargs.storage, creds=self.jargs.connection_file) - create_table(sdf, connection_profile, name_tb, schema, creds, self.jargs.is_incremental, self.jargs.redshift_s3_tmp_dir, self.jargs.merged_args.get('spark_version', '2.4')) + + connection_profile = self.jargs.copy_to_redshift["creds"] + schema, name_tb = self.jargs.copy_to_redshift["table"].split(".") + schema = ( + schema.format(schema=self.jargs.schema) if "{schema}" in schema else schema + ) + creds = Cred_Ops_Dispatcher().retrieve_secrets( + self.jargs.storage, creds=self.jargs.connection_file + ) + create_table( + sdf, + connection_profile, + name_tb, + schema, + creds, + self.jargs.is_incremental, + self.jargs.redshift_s3_tmp_dir, + self.jargs.merged_args.get("spark_version", "2.4"), + ) def copy_to_clickhouse(self, sdf): # import put here below to avoid loading heavy libraries when not needed (optional feature). from yaetos.clickhouse import create_table - connection_profile = self.jargs.copy_to_clickhouse['creds'] - schema, name_tb= self.jargs.copy_to_clickhouse['table'].split('.') - schema = schema.format(schema=self.jargs.schema) if '{schema}' in schema else schema - creds = Cred_Ops_Dispatcher().retrieve_secrets(self.jargs.storage, creds=self.jargs.connection_file) - create_table(sdf, connection_profile, name_tb, schema, creds, self.jargs.is_incremental) + + connection_profile = self.jargs.copy_to_clickhouse["creds"] + schema, name_tb = self.jargs.copy_to_clickhouse["table"].split(".") + schema = ( + schema.format(schema=self.jargs.schema) if "{schema}" in schema else schema + ) + creds = Cred_Ops_Dispatcher().retrieve_secrets( + self.jargs.storage, creds=self.jargs.connection_file + ) + create_table( + sdf, connection_profile, name_tb, schema, creds, self.jargs.is_incremental + ) def push_to_kafka(self, output, types): - """ Needs to be overriden by each specific job.""" + """Needs to be overriden by each specific job.""" raise NotImplementedError def send_msg(self, msg, recipients=None): - """ Sending message to recipients (list of email addresse) or, if not specified, to yml 'owners'. 
+ """Sending message to recipients (list of email addresse) or, if not specified, to yml 'owners'. Pulling email sender account info from connection_file.""" if not recipients: - recipients = self.jargs.merged_args.get('owners') + recipients = self.jargs.merged_args.get("owners") if not recipients: - logger.error("Email can't be sent since no recipient set in {}, .\nMessage : \n{}".format(self.jargs.job_param_file, msg)) + logger.error( + "Email can't be sent since no recipient set in {}, .\nMessage : \n{}".format( + self.jargs.job_param_file, msg + ) + ) return None - creds = Cred_Ops_Dispatcher().retrieve_secrets(self.jargs.storage, creds=self.jargs.connection_file) + creds = Cred_Ops_Dispatcher().retrieve_secrets( + self.jargs.storage, creds=self.jargs.connection_file + ) creds_section = self.jargs.email_cred_section - sender_email = creds.get(creds_section, 'sender_email') - password = creds.get(creds_section, 'password') - smtp_server = creds.get(creds_section, 'smtp_server') - port = creds.get(creds_section, 'port') + sender_email = creds.get(creds_section, "sender_email") + password = creds.get(creds_section, "password") + smtp_server = creds.get(creds_section, "smtp_server") + port = creds.get(creds_section, "port") for recipient in recipients: send_email(msg, recipient, sender_email, password, smtp_server, port) - logger.info('Email sent to {}'.format(recipient)) + logger.info("Email sent to {}".format(recipient)) def send_job_failure_email(self, error_msg): - message = """Subject: [Data Pipeline Failure] {name}\n\nA Data pipeline named '{name}' failed.\nError message:\n{error}\n\nPlease check logs in AWS.""".format(name=self.jargs.job_name, error=error_msg) + message = """Subject: [Data Pipeline Failure] {name}\n\nA Data pipeline named '{name}' failed.\nError message:\n{error}\n\nPlease check logs in AWS.""".format( + name=self.jargs.job_name, error=error_msg + ) self.send_msg(message) + @staticmethod + def format_datetime(wiki_dt_str: str) -> str: + """ + Converts a string date to a datetime to then parse to a target format. + """ + wiki_dt = datetime.strptime(wiki_dt_str, "%Y%m%d%H%M%S") + return wiki_dt.strftime("%Y-%m-%d %H:%M:%S") + + @staticmethod + def date_diff_sec(start_dt: datetime, end_dt: datetime) -> int: + return int((end_dt - start_dt).total_seconds()) + @staticmethod def check_pk(df, pks): count = df.count() count_pk = df.select(pks).dropDuplicates().count() if count != count_pk: - logger.error("Given fields ({}) are not PKs since not unique. count={}, count_pk={}".format(pks, count, count_pk)) + logger.error( + "Given fields ({}) are not PKs since not unique. count={}, count_pk={}".format( + pks, count, count_pk + ) + ) return False else: - logger.info("Given fields ({}) are PKs (i.e. unique). count=count_pk={}".format(pks, count)) + logger.info( + "Given fields ({}) are PKs (i.e. unique). 
count=count_pk={}".format( + pks, count + ) + ) return True def identify_non_unique_pks(self, df, pks): return su.identify_non_unique_pks(df, pks) -class Period_Builder(): + +class Period_Builder: @staticmethod def get_last_day(as_of_date=datetime.utcnow()): last_day_dt = as_of_date + relativedelta(days=-1) @@ -664,7 +998,9 @@ def get_first_to_last_day(first_day, as_of_date=datetime.utcnow()): iter_days = iter_days + relativedelta(days=+1) return periods - def get_last_output_to_last_day(self, last_run_period, first_day_input, as_of_date=datetime.utcnow()): + def get_last_output_to_last_day( + self, last_run_period, first_day_input, as_of_date=datetime.utcnow() + ): periods = self.get_first_to_last_day(first_day_input, as_of_date) if last_run_period: periods = [item for item in periods if item > last_run_period] @@ -672,50 +1008,59 @@ def get_last_output_to_last_day(self, last_run_period, first_day_input, as_of_da return periods -class Schema_Builder(): - TYPES_FOLDER = 'schemas/' +class Schema_Builder: + TYPES_FOLDER = "schemas/" + def generate_schemas(self, loaded_datasets, output): - yml = {'inputs':{}} + yml = {"inputs": {}} for key, value in loaded_datasets.items(): if value: # TODO: make it fail softly in case code below fails, so it doesn't block job, since it is for logging only. - yml['inputs'][key] = {fd.name: fd.dataType.__str__() for fd in value.schema.fields} - yml['output'] = {fd.name: fd.dataType.__str__() for fd in output.schema.fields} + yml["inputs"][key] = { + fd.name: fd.dataType.__str__() for fd in value.schema.fields + } + yml["output"] = {fd.name: fd.dataType.__str__() for fd in output.schema.fields} self.yml = yml def save_yaml(self, job_name): - job_name = job_name.replace('.py', '') - fname = self.TYPES_FOLDER + job_name+'.yaml' + job_name = job_name.replace(".py", "") + fname = self.TYPES_FOLDER + job_name + ".yaml" os.makedirs(os.path.dirname(fname), exist_ok=True) - with open(fname, 'w') as file: + with open(fname, "w") as file: ignored = yaml.dump(self.yml, file) -class Job_Yml_Parser(): +class Job_Yml_Parser: """Functions to load and parse yml, and functions to get job_name, which is the key to the yml info.""" def __init__(self, job_name, job_param_file, mode, skip_job=False): self.yml_args = self.set_job_yml(job_name, job_param_file, mode, skip_job) - self.yml_args['job_name'] = job_name - self.yml_args['py_job'] = self.yml_args.get('py_job') or self.set_py_job_from_name(job_name) - self.yml_args['sql_file'] = self.set_sql_file_from_name(job_name, mode) + self.yml_args["job_name"] = job_name + self.yml_args["py_job"] = self.yml_args.get( + "py_job" + ) or self.set_py_job_from_name(job_name) + self.yml_args["sql_file"] = self.set_sql_file_from_name(job_name, mode) @staticmethod def set_job_name_from_file(job_file): # when run from Flow(), job_file is full path. When run from ETL directly, job_file is "jobs/..." . - if job_file.startswith(CLUSTER_APP_FOLDER+'jobs/'): - job_name = job_file[len(CLUSTER_APP_FOLDER+'jobs/'):] - elif job_file.startswith(CLUSTER_APP_FOLDER+'scripts.zip/jobs/'): - job_name = job_file[len(CLUSTER_APP_FOLDER+'scripts.zip/jobs/'):] - elif job_file.startswith(LOCAL_APP_FOLDER+'jobs/'): - job_name = job_file[len(LOCAL_APP_FOLDER+'jobs/'):] - elif job_file.startswith(LOCAL_JOB_REPO_FOLDER+'jobs/'): # when run from external repo. 
- job_name = job_file[len(LOCAL_JOB_REPO_FOLDER+'jobs/'):] - elif job_file.startswith('jobs/'): - job_name = job_file[len('jobs/'):] - elif job_file.__contains__('/scripts.zip/jobs/'): + if job_file.startswith(CLUSTER_APP_FOLDER + "jobs/"): + job_name = job_file[len(CLUSTER_APP_FOLDER + "jobs/") :] + elif job_file.startswith(CLUSTER_APP_FOLDER + "scripts.zip/jobs/"): + job_name = job_file[len(CLUSTER_APP_FOLDER + "scripts.zip/jobs/") :] + elif job_file.startswith(LOCAL_APP_FOLDER + "jobs/"): + job_name = job_file[len(LOCAL_APP_FOLDER + "jobs/") :] + elif job_file.startswith( + LOCAL_JOB_REPO_FOLDER + "jobs/" + ): # when run from external repo. + job_name = job_file[len(LOCAL_JOB_REPO_FOLDER + "jobs/") :] + elif job_file.startswith("jobs/"): + job_name = job_file[len("jobs/") :] + elif job_file.__contains__("/scripts.zip/jobs/"): # To deal with cases like job_file = '/mnt/tmp/spark-48e465ad-cca8-4216-a77f-ce069d04766f/userFiles-b1dad8aa-76ea-4adf-97da-dc9273666263/scripts.zip/jobs/infojobs/churn_prediction/users_inscriptions_daily.py' that appeared in new emr version. - job_name = job_file[job_file.find('/scripts.zip/jobs/')+len('/scripts.zip/jobs/'):] + job_name = job_file[ + job_file.find("/scripts.zip/jobs/") + len("/scripts.zip/jobs/") : + ] else: # To deal with case when job is defined outside of this repo (and not in jobs/ folder in external folder), i.e. isn't located in 'jobs/' folder. In this case, job name in metadata file should include full path (inc job base path). job_name = job_file @@ -724,21 +1069,23 @@ def set_job_name_from_file(job_file): @staticmethod def set_py_job_from_name(job_name): - py_job='jobs/{}'.format(job_name) + py_job = "jobs/{}".format(job_name) logger.info("py_job: '{}', from job_name: '{}'".format(py_job, job_name)) return py_job @staticmethod def set_sql_file_from_name(job_name, mode): - if not job_name.endswith('.sql'): + if not job_name.endswith(".sql"): return None - if mode in ('dev_EMR', 'prod_EMR'): - sql_file=CLUSTER_APP_FOLDER+'jobs/{}'.format(job_name) - elif mode == 'dev_local': - sql_file='jobs/{}'.format(job_name) + if mode in ("dev_EMR", "prod_EMR"): + sql_file = CLUSTER_APP_FOLDER + "jobs/{}".format(job_name) + elif mode == "dev_local": + sql_file = "jobs/{}".format(job_name) else: - raise Exception("Mode not supported in set_sql_file_from_name(): {}".format(mode)) + raise Exception( + "Mode not supported in set_sql_file_from_name(): {}".format(mode) + ) logger.info("sql_file: '{}', from job_name: '{}'".format(sql_file, job_name)) return sql_file @@ -748,35 +1095,62 @@ def set_job_yml(self, job_name, job_param_file, yml_mode, skip_job): return {} yml = self.load_meta(job_param_file) - if job_name not in yml['jobs'] and not skip_job: - raise KeyError("Your job '{}' can't be found in jobs_metadata file '{}'. Add it there or make sure the name matches".format(job_name, job_param_file)) - elif job_name not in yml['jobs'] and skip_job: + if job_name not in yml["jobs"] and not skip_job: + raise KeyError( + "Your job '{}' can't be found in jobs_metadata file '{}'. Add it there or make sure the name matches".format( + job_name, job_param_file + ) + ) + elif job_name not in yml["jobs"] and skip_job: job_yml = {} else: - job_yml = yml['jobs'][job_name] + job_yml = yml["jobs"][job_name] - if yml_mode not in yml['common_params']['mode_specific_params']: - raise KeyError("Your yml mode '{}' can't be found in jobs_metadata file '{}'. 
Add it there or make sure the name matches".format(yml_mode, job_param_file)) + if yml_mode not in yml["common_params"]["mode_specific_params"]: + raise KeyError( + "Your yml mode '{}' can't be found in jobs_metadata file '{}'. Add it there or make sure the name matches".format( + yml_mode, job_param_file + ) + ) - mode_spec_yml = yml['common_params']['mode_specific_params'][yml_mode] - out = yml['common_params']['all_mode_params'] + mode_spec_yml = yml["common_params"]["mode_specific_params"][yml_mode] + out = yml["common_params"]["all_mode_params"] out.update(mode_spec_yml) out.update(job_yml) return out @staticmethod def load_meta(fname): - with open(fname, 'r') as stream: + with open(fname, "r") as stream: yml = yaml.load(stream) return yml -class Job_Args_Parser(): - - DEPLOY_ARGS_LIST = ['aws_config_file', 'aws_setup', 'leave_on', 'push_secrets', 'frequency', 'start_date', - 'email', 'mode', 'deploy', 'terminate_after', 'spark_version'] - - def __init__(self, defaults_args, yml_args, job_args, cmd_args, job_name=None, loaded_inputs={}): +class Job_Args_Parser: + + DEPLOY_ARGS_LIST = [ + "aws_config_file", + "aws_setup", + "leave_on", + "push_secrets", + "frequency", + "start_date", + "email", + "mode", + "deploy", + "terminate_after", + "spark_version", + ] + + def __init__( + self, + defaults_args, + yml_args, + job_args, + cmd_args, + job_name=None, + loaded_inputs={}, + ): """Mix all params, add more and tweak them when needed (like depending on storage type, execution mode...). If yml_args not provided, it will go and get it. Sets of params: @@ -791,10 +1165,20 @@ def __init__(self, defaults_args, yml_args, job_args, cmd_args, job_name=None, l args = defaults_args.copy() args.update(job_args) args.update(cmd_args) - args.update({'job_name':job_name} if job_name else {}) - args['mode'] = 'dev_EMR' if args['mode'] == 'dev_local' and args['deploy'] in ('EMR', 'EMR_Scheduled') else args['mode'] - assert 'job_name' in args.keys() - yml_args = Job_Yml_Parser(args['job_name'], args['job_param_file'], args['mode'], args.get('skip_job', False)).yml_args + args.update({"job_name": job_name} if job_name else {}) + args["mode"] = ( + "dev_EMR" + if args["mode"] == "dev_local" + and args["deploy"] in ("EMR", "EMR_Scheduled") + else args["mode"] + ) + assert "job_name" in args.keys() + yml_args = Job_Yml_Parser( + args["job_name"], + args["job_param_file"], + args["mode"], + args.get("skip_job", False), + ).yml_args # Get merged args, with yml (order matters) # TODO: need to add business of flatten/unflatten so they can be merged cleanely. 
@@ -802,10 +1186,17 @@ def __init__(self, defaults_args, yml_args, job_args, cmd_args, job_name=None, l args.update(yml_args) args.update(job_args) args.update(cmd_args) - args['mode'] = 'dev_EMR' if args['mode'] == 'dev_local' and args['deploy'] in ('EMR', 'EMR_Scheduled') else args['mode'] + args["mode"] = ( + "dev_EMR" + if args["mode"] == "dev_local" + and args["deploy"] in ("EMR", "EMR_Scheduled") + else args["mode"] + ) args = self.update_args(args, loaded_inputs) - [setattr(self, key, value) for key, value in args.items()] # attach vars to self.* + [ + setattr(self, key, value) for key, value in args.items() + ] # attach vars to self.* # Other access to vars self.merged_args = args self.defaults_args = defaults_args @@ -815,17 +1206,27 @@ def __init__(self, defaults_args, yml_args, job_args, cmd_args, job_name=None, l logger.info("Job args: \n{}".format(pformat(args))) def get_deploy_args(self): - return {key: value for key, value in self.merged_args.items() if key in self.DEPLOY_ARGS_LIST} + return { + key: value + for key, value in self.merged_args.items() + if key in self.DEPLOY_ARGS_LIST + } def get_app_args(self): - return {key: value for key, value in self.merged_args.items() if key not in self.DEPLOY_ARGS_LIST or key=='mode'} + return { + key: value + for key, value in self.merged_args.items() + if key not in self.DEPLOY_ARGS_LIST or key == "mode" + } def update_args(self, args, loaded_inputs): - """ Updating params or adding new ones, according to execution environment (local, prod...)""" - args['inputs'] = self.set_inputs(args, loaded_inputs) + """Updating params or adding new ones, according to execution environment (local, prod...)""" + args["inputs"] = self.set_inputs(args, loaded_inputs) # args['output'] = self.set_output(cmd_args, yml_args) # TODO: fix later - args['is_incremental'] = self.set_is_incremental(args.get('inputs', {}), args.get('output', {})) - args['output']['type'] = args.pop('output.type', None) or args['output']['type'] + args["is_incremental"] = self.set_is_incremental( + args.get("inputs", {}), args.get("output", {}) + ) + args["output"]["type"] = args.pop("output.type", None) or args["output"]["type"] return args # TODO: modify later since not used now @@ -838,9 +1239,11 @@ def set_inputs(self, args, loaded_inputs): # inputs = {key: {'path': val['path'], 'type':input_types[key]['type']} for key, val in input_paths.items()} # return inputs if loaded_inputs: - return {key: {'path': val, 'type': 'df'} for key, val in loaded_inputs.items()} + return { + key: {"path": val, "type": "df"} for key, val in loaded_inputs.items() + } else: - return args.get('inputs', {}) + return args.get("inputs", {}) # TODO: modify later since not used now # def set_output(self, cmd_args, yml_args): @@ -859,19 +1262,22 @@ def set_inputs(self, args, loaded_inputs): # return output def set_is_incremental(self, inputs, output): - return any(['inc_field' in inputs[item] for item in inputs.keys()]) or 'inc_field' in output + return ( + any(["inc_field" in inputs[item] for item in inputs.keys()]) + or "inc_field" in output + ) -class Path_Handler(): +class Path_Handler: def __init__(self, path, base_path=None): if base_path: - path = path.format(base_path=base_path, latest='{latest}', now='{now}') + path = path.format(base_path=base_path, latest="{latest}", now="{now}") self.path = path def expand_later(self, storage): path = self.path - if '{latest}' in path: - upstream_path = path.split('{latest}')[0] + if "{latest}" in path: + upstream_path = path.split("{latest}")[0] paths = 
FS_Ops_Dispatcher().listdir(upstream_path) latest_date = max(paths) path = path.format(latest=latest_date) @@ -879,45 +1285,71 @@ def expand_later(self, storage): def expand_now(self, now_dt): path = self.path - if '{now}' in path: - current_time = now_dt.strftime('date%Y%m%d_time%H%M%S_utc') + if "{now}" in path: + current_time = now_dt.strftime("date%Y%m%d_time%H%M%S_utc") path = path.format(now=current_time) return path def get_base(self): - if '{latest}' in self.path: - return self.path.split('{latest}')[0] - elif '{now}' in self.path: - return self.path.split('{now}')[0] + if "{latest}" in self.path: + return self.path.split("{latest}")[0] + elif "{now}" in self.path: + return self.path.split("{now}")[0] else: return self.path -class Commandliner(): +class Commandliner: def __init__(self, Job, **job_args): parser, defaults_args = self.define_commandline_args() cmd_args = self.set_commandline_args(parser) # Building "job", which will include all job args. - if Job is None: # when job run from "python launcher.py --job_name=some_name_from_job_metadata_file" - jargs = Job_Args_Parser(defaults_args=defaults_args, yml_args=None, job_args=job_args, cmd_args=cmd_args, loaded_inputs={}) + if ( + Job is None + ): # when job run from "python launcher.py --job_name=some_name_from_job_metadata_file" + jargs = Job_Args_Parser( + defaults_args=defaults_args, + yml_args=None, + job_args=job_args, + cmd_args=cmd_args, + loaded_inputs={}, + ) Job = get_job_class(jargs.py_job) job = Job(jargs=jargs) else: # when job run from "python some_job.py" - job = Job(pre_jargs={'defaults_args':defaults_args, 'job_args': job_args, 'cmd_args':cmd_args}) # can provide jargs directly here since job_file (and so job_name) needs to be extracted from job first. So, letting job build jargs. + job = Job( + pre_jargs={ + "defaults_args": defaults_args, + "job_args": job_args, + "cmd_args": cmd_args, + } + ) # can provide jargs directly here since job_file (and so job_name) needs to be extracted from job first. So, letting job build jargs. # Executing or deploying - if job.jargs.deploy in ('none'): # when executing job code + if job.jargs.deploy in ("none"): # when executing job code self.launch_run_mode(job) - elif job.jargs.deploy in ('EMR', 'EMR_Scheduled', 'code'): # when deploying to AWS for execution there - self.launch_deploy_mode(job.jargs.get_deploy_args(), job.jargs.get_app_args()) + elif job.jargs.deploy in ( + "EMR", + "EMR_Scheduled", + "code", + ): # when deploying to AWS for execution there + self.launch_deploy_mode( + job.jargs.get_deploy_args(), job.jargs.get_app_args() + ) @staticmethod def set_commandline_args(parser): """Command line arguments take precedence over function ones.""" cmd_args, cmd_unknown_args = parser.parse_known_args() - cmd_args = {key: value for (key, value) in cmd_args.__dict__.items() if value is not None} - cmd_unknown_args = dict([item[2:].split('=') for item in cmd_unknown_args]) # imposes for unknown args to be defined with '=' and to start with '--' + cmd_args = { + key: value + for (key, value) in cmd_args.__dict__.items() + if value is not None + } + cmd_unknown_args = dict( + [item[2:].split("=") for item in cmd_unknown_args] + ) # imposes for unknown args to be defined with '=' and to start with '--' cmd_args.update(cmd_unknown_args) return cmd_args @@ -925,60 +1357,126 @@ def set_commandline_args(parser): def define_commandline_args(): # Defined here separatly for overridability. 
parser = argparse.ArgumentParser() - parser.add_argument("-d", "--deploy", choices=set(['none', 'EMR', 'EMR_Scheduled', 'EMR_DataPipeTest', 'code']), help="Choose where to run the job.") - parser.add_argument("-m", "--mode", choices=set(['dev_local', 'dev_EMR', 'prod_EMR']), help="Choose which set of params to use from jobs_metadata.yml file.") - parser.add_argument("-j", "--job_param_file", help="Identify file to use. It can be set to 'False' to not load any file and provide all parameters through job or command line arguments.") + parser.add_argument( + "-d", + "--deploy", + choices=set(["none", "EMR", "EMR_Scheduled", "EMR_DataPipeTest", "code"]), + help="Choose where to run the job.", + ) + parser.add_argument( + "-m", + "--mode", + choices=set(["dev_local", "dev_EMR", "prod_EMR"]), + help="Choose which set of params to use from jobs_metadata.yml file.", + ) + parser.add_argument( + "-j", + "--job_param_file", + help="Identify file to use. It can be set to 'False' to not load any file and provide all parameters through job or command line arguments.", + ) parser.add_argument("-n", "--job_name", help="Identify registry job to use.") parser.add_argument("-q", "--sql_file", help="Path to an sql file to execute.") - parser.add_argument("--connection_file", help="Identify file to use. Default to repo one.") - parser.add_argument("--jobs_folder", help="Identify the folder where job code is. Necessary if job code is outside the repo, i.e. if this is used as an external library. By default, uses the repo 'jobs/' folder.") - parser.add_argument("-s", "--storage", choices=set(['local', 's3']), help="Choose 'local' (default) or 's3'.") - parser.add_argument("-x", "--dependencies", action='store_true', help="Run the job dependencies and then the job itself") - parser.add_argument("-c", "--rerun_criteria", choices=set(['last_date', 'output_empty', 'both']), help="Choose criteria to rerun the next increment or not. 'last_date' usefull if we know data goes to a certain date. 'output_empty' not to be used if increment may be empty but later ones not. Only relevant for incremental job.") - parser.add_argument("--chain_dependencies", action='store_true', help="Run dependant jobs in a chained way, i.e. passing output to next step without dropping to disk. Only useful if ran with dependencies (-x) and requires output to be dataframes.") - parser.add_argument("-l", "--load_connectors", choices=set(['all', 'none']), help="Load java packages to enable spark connectors (s3, redshift, mysql). Set to 'none' to have faster spark start time and smaller log when connectors are not necessary. Only useful when mode=dev_local.") - parser.add_argument("-t", "--output.type", choices=set(['csv', 'parquet']), help="Override output type. Useful for development. Can be ignored otherwise.") + parser.add_argument( + "--connection_file", help="Identify file to use. Default to repo one." + ) + parser.add_argument( + "--jobs_folder", + help="Identify the folder where job code is. Necessary if job code is outside the repo, i.e. if this is used as an external library. 
By default, uses the repo 'jobs/' folder.", + ) + parser.add_argument( + "-s", + "--storage", + choices=set(["local", "s3"]), + help="Choose 'local' (default) or 's3'.", + ) + parser.add_argument( + "-x", + "--dependencies", + action="store_true", + help="Run the job dependencies and then the job itself", + ) + parser.add_argument( + "-c", + "--rerun_criteria", + choices=set(["last_date", "output_empty", "both"]), + help="Choose criteria to rerun the next increment or not. 'last_date' usefull if we know data goes to a certain date. 'output_empty' not to be used if increment may be empty but later ones not. Only relevant for incremental job.", + ) + parser.add_argument( + "--chain_dependencies", + action="store_true", + help="Run dependant jobs in a chained way, i.e. passing output to next step without dropping to disk. Only useful if ran with dependencies (-x) and requires output to be dataframes.", + ) + parser.add_argument( + "-l", + "--load_connectors", + choices=set(["all", "none"]), + help="Load java packages to enable spark connectors (s3, redshift, mysql). Set to 'none' to have faster spark start time and smaller log when connectors are not necessary. Only useful when mode=dev_local.", + ) + parser.add_argument( + "-t", + "--output.type", + choices=set(["csv", "parquet"]), + help="Override output type. Useful for development. Can be ignored otherwise.", + ) # Deploy specific - parser.add_argument("--aws_config_file", help="Identify file to use. Default to repo one.") - parser.add_argument("-a", "--aws_setup", help="Choose aws setup from conf/aws_config.cfg, typically 'prod' or 'dev'. Only relevant if choosing to deploy to a cluster.") - parser.add_argument("-o", "--leave_on", action='store_true', help="Use arg to not terminate cluster after running the job. Mostly for testing. Only relevant when creating a new cluster when deploy=EMR.") - parser.add_argument("-p", "--push_secrets", action='store_true', help="Pushing secrets to cluster. Only relevant if choosing to deploy to a cluster.") + parser.add_argument( + "--aws_config_file", help="Identify file to use. Default to repo one." + ) + parser.add_argument( + "-a", + "--aws_setup", + help="Choose aws setup from conf/aws_config.cfg, typically 'prod' or 'dev'. Only relevant if choosing to deploy to a cluster.", + ) + parser.add_argument( + "-o", + "--leave_on", + action="store_true", + help="Use arg to not terminate cluster after running the job. Mostly for testing. Only relevant when creating a new cluster when deploy=EMR.", + ) + parser.add_argument( + "-p", + "--push_secrets", + action="store_true", + help="Pushing secrets to cluster. Only relevant if choosing to deploy to a cluster.", + ) # --inputs and --output args can be set from job or commandline too, just not set here. defaults = { - 'deploy': 'none', - 'mode': 'dev_local', - 'job_param_file': JOBS_METADATA_FILE, - 'job_name': None, - 'sql_file': None, - 'connection_file': CONNECTION_FILE, - 'jobs_folder': JOB_FOLDER, - 'storage': 'local', - # 'dependencies': False, # only set from commandline - 'rerun_criteria': 'last_date', - # 'chain_dependencies': False, # only set from commandline - 'load_connectors': 'all', - # 'output.type': 'csv', # skipped on purpose to avoid setting it if not set in cmd line. - #-- Deploy specific below -- - 'aws_config_file': AWS_CONFIG_FILE, - 'aws_setup': 'dev', - 'code_source': 'lib', # Other options: 'repo' TODO: make it automatic so parameter not needed. 
- # 'leave_on': False, # only set from commandline - # 'push_secrets': False, # only set from commandline - #-- Not added in command line args: - 'enable_redshift_push': True, - 'base_path': '', - 'save_schemas': False, - 'manage_git_info': False, - 'add_created_at': 'true', # set as string to be overrideable in cmdline. - 'no_fw_cache': False, - 'spark_boot': True, # options ('spark', 'pandas') (experimental). - } + "deploy": "none", + "mode": "dev_local", + "job_param_file": JOBS_METADATA_FILE, + "job_name": None, + "sql_file": None, + "connection_file": CONNECTION_FILE, + "jobs_folder": JOB_FOLDER, + "storage": "local", + # 'dependencies': False, # only set from commandline + "rerun_criteria": "last_date", + # 'chain_dependencies': False, # only set from commandline + "load_connectors": "all", + # 'output.type': 'csv', # skipped on purpose to avoid setting it if not set in cmd line. + # -- Deploy specific below -- + "aws_config_file": AWS_CONFIG_FILE, + "aws_setup": "dev", + "code_source": "lib", # Other options: 'repo' TODO: make it automatic so parameter not needed. + # 'leave_on': False, # only set from commandline + # 'push_secrets': False, # only set from commandline + # -- Not added in command line args: + "enable_redshift_push": True, + "base_path": "", + "save_schemas": False, + "manage_git_info": False, + "add_created_at": "true", # set as string to be overrideable in cmdline. + "no_fw_cache": False, + "spark_boot": True, # options ('spark', 'pandas') (experimental). + } return parser, defaults def launch_run_mode(self, job): app_name = job.jargs.job_name if job.jargs.spark_boot is True: - sc, sc_sql = self.create_contexts(app_name, job.jargs) # TODO: set spark_version default upstream, remove it from here and from deploy.py. + sc, sc_sql = self.create_contexts( + app_name, job.jargs + ) # TODO: set spark_version default upstream, remove it from here and from deploy.py. else: sc, sc_sql = None, None @@ -990,6 +1488,7 @@ def launch_run_mode(self, job): def launch_deploy_mode(self, deploy_args, app_args): # Load deploy lib here instead of at module level to remove dependency on it when running code locally from yaetos.deploy import DeployPySparkScriptOnAws + DeployPySparkScriptOnAws(deploy_args, app_args).run() def create_contexts(self, app_name, jargs): @@ -1001,114 +1500,136 @@ def create_contexts(self, app_name, jargs): conf = SparkConf() # TODO: move spark-submit params here since it is more generic than in spark submit, params like "spark.driver.memoryOverhead" cause pb in spark submit. 
- if jargs.merged_args.get('driver-memoryOverhead'): # For extra overhead for python in driver (typically pandas) - conf = conf.set("spark.driver.memoryOverhead", jargs.merged_args['driver-memoryOverhead']) + if jargs.merged_args.get( + "driver-memoryOverhead" + ): # For extra overhead for python in driver (typically pandas) + conf = conf.set( + "spark.driver.memoryOverhead", + jargs.merged_args["driver-memoryOverhead"], + ) - if jargs.mode == 'dev_local' and jargs.load_connectors == 'all': + if jargs.mode == "dev_local" and jargs.load_connectors == "all": # Env vars for S3 access - credentials = boto3.Session(profile_name='default').get_credentials() - os.environ['AWS_ACCESS_KEY_ID'] = credentials.access_key - os.environ['AWS_SECRET_ACCESS_KEY'] = credentials.secret_key + credentials = boto3.Session(profile_name="default").get_credentials() + os.environ["AWS_ACCESS_KEY_ID"] = credentials.access_key + os.environ["AWS_SECRET_ACCESS_KEY"] = credentials.secret_key # JARs - package = PACKAGES_LOCAL if jargs.merged_args.get('spark_version', '2.4') == '2.4' else PACKAGES_LOCAL_ALT - package_str = ','.join(package) - conf = conf \ - .set("spark.jars.packages", package_str) \ - .set("spark.jars", JARS) + package = ( + PACKAGES_LOCAL + if jargs.merged_args.get("spark_version", "2.4") == "2.4" + else PACKAGES_LOCAL_ALT + ) + package_str = ",".join(package) + conf = conf.set("spark.jars.packages", package_str).set("spark.jars", JARS) # Setup above not needed when running from EMR where setup done in spark-submit. - if jargs.merged_args.get('emr_core_instances') == 0: - conf = conf \ - .set("spark.hadoop.fs.s3a.buffer.dir", '/tmp') \ - .set("spark.hadoop.fs.s3a.fast.upload.active.blocks", '1') + if jargs.merged_args.get("emr_core_instances") == 0: + conf = conf.set("spark.hadoop.fs.s3a.buffer.dir", "/tmp").set( + "spark.hadoop.fs.s3a.fast.upload.active.blocks", "1" + ) - spark = SparkSession.builder \ - .appName(app_name) \ - .config(conf=conf) \ - .getOrCreate() + spark = SparkSession.builder.appName(app_name).config(conf=conf).getOrCreate() sc = spark.sparkContext sc_sql = SQLContext(sc) - logger.info('Spark Config: {}'.format(sc.getConf().getAll())) + logger.info("Spark Config: {}".format(sc.getConf().getAll())) return sc, sc_sql -class Flow(): +class Flow: def __init__(self, launch_jargs, app_name): self.app_name = app_name - df = self.create_connections_jobs(launch_jargs.storage, launch_jargs.merged_args) - logger.debug('Flow app_name : {}, connection_table: {}'.format(app_name, df)) + df = self.create_connections_jobs( + launch_jargs.storage, launch_jargs.merged_args + ) + logger.debug("Flow app_name : {}, connection_table: {}".format(app_name, df)) graph = self.create_global_graph(df) # top to bottom - tree = self.create_local_tree(graph, nx.DiGraph(), app_name) # bottom to top - self.leafs = self.get_leafs(tree, leafs=[]) # bottom to top - logger.info('Sequence of jobs to be run: {}'.format(self.leafs)) - logger.info('-'*80) - logger.info('-') - launch_jargs.cmd_args.pop('job_name', None) # removing since it should be pulled from yml and not be overriden by cmd_args. 
- launch_jargs.job_args.pop('job_name', None) # same + tree = self.create_local_tree(graph, nx.DiGraph(), app_name) # bottom to top + self.leafs = self.get_leafs(tree, leafs=[]) # bottom to top + logger.info("Sequence of jobs to be run: {}".format(self.leafs)) + logger.info("-" * 80) + logger.info("-") + launch_jargs.cmd_args.pop( + "job_name", None + ) # removing since it should be pulled from yml and not be overriden by cmd_args. + launch_jargs.job_args.pop("job_name", None) # same self.launch_jargs = launch_jargs def run_pipeline(self, sc, sc_sql): """Load all job classes and run them""" df = {} for job_name in self.leafs: - logger.info('About to run job_name: {}'.format(job_name)) + logger.info("About to run job_name: {}".format(job_name)) # Get yml - yml_args = Job_Yml_Parser(job_name, self.launch_jargs.job_param_file, self.launch_jargs.mode).yml_args + yml_args = Job_Yml_Parser( + job_name, self.launch_jargs.job_param_file, self.launch_jargs.mode + ).yml_args # Get loaded_inputs loaded_inputs = {} if self.launch_jargs.chain_dependencies: - if yml_args.get('inputs', 'no input') == 'no input': - raise Exception("Pb with loading job_yml or finding 'inputs' parameter in it, so 'chain_dependencies' argument not useable in this case.") - for in_name, in_properties in yml_args['inputs'].items(): - if in_properties.get('from'): - loaded_inputs[in_name] = df[in_properties['from']] + if yml_args.get("inputs", "no input") == "no input": + raise Exception( + "Pb with loading job_yml or finding 'inputs' parameter in it, so 'chain_dependencies' argument not useable in this case." + ) + for in_name, in_properties in yml_args["inputs"].items(): + if in_properties.get("from"): + loaded_inputs[in_name] = df[in_properties["from"]] # Get jargs - jargs = Job_Args_Parser(self.launch_jargs.defaults_args, yml_args, self.launch_jargs.job_args, self.launch_jargs.cmd_args, loaded_inputs=loaded_inputs) - - Job = get_job_class(yml_args['py_job']) + jargs = Job_Args_Parser( + self.launch_jargs.defaults_args, + yml_args, + self.launch_jargs.job_args, + self.launch_jargs.cmd_args, + loaded_inputs=loaded_inputs, + ) + + Job = get_job_class(yml_args["py_job"]) job = Job(jargs=jargs, loaded_inputs=loaded_inputs) - df[job_name] = job.etl(sc, sc_sql) # at this point df[job_name] is unpersisted. TODO: keep it persisted. + df[job_name] = job.etl( + sc, sc_sql + ) # at this point df[job_name] is unpersisted. TODO: keep it persisted. if not self.launch_jargs.chain_dependencies: df[job_name].unpersist() del df[job_name] gc.collect() - logger.info('-'*80) - logger.info('-') + logger.info("-" * 80) + logger.info("-") @staticmethod def create_connections_jobs(storage, args): - yml = Job_Yml_Parser.load_meta(args['job_param_file']) + yml = Job_Yml_Parser.load_meta(args["job_param_file"]) connections = [] - for job_name, job_meta in yml['jobs'].items(): - dependencies = job_meta.get('dependencies') or [] + for job_name, job_meta in yml["jobs"].items(): + dependencies = job_meta.get("dependencies") or [] for dependency in dependencies: - row = {'source_job': dependency, 'destination_job': job_name} + row = {"source_job": dependency, "destination_job": job_name} connections.append(row) return pd.DataFrame(connections) @staticmethod def create_global_graph(df): - """ Directed Graph from source to target. df must contain 'source_dataset' and 'target_dataset'. + """Directed Graph from source to target. df must contain 'source_dataset' and 'target_dataset'. 
All other fields are attributed to target.""" DG = nx.DiGraph() for ii, item in df.iterrows(): item = item.to_dict() - source_dataset = item.pop('source_job') - target_dataset = item.pop('destination_job') - item.update({'name':target_dataset}) + source_dataset = item.pop("source_job") + target_dataset = item.pop("destination_job") + item.update({"name": target_dataset}) DG.add_edge(source_dataset, target_dataset) - DG.add_node(source_dataset, name=source_dataset) # (source_dataset, **{'name':source_dataset}) + DG.add_node( + source_dataset, name=source_dataset + ) # (source_dataset, **{'name':source_dataset}) DG.add_node(target_dataset, **item) return DG def create_local_tree(self, DG, tree, ref_node): - """ Builds tree recursively. Uses graph data structure but enforces tree to simplify downstream.""" + """Builds tree recursively. Uses graph data structure but enforces tree to simplify downstream.""" nodes = DG.predecessors(ref_node) tree.add_node(ref_node, name=DG.nodes[ref_node]) for item in nodes: @@ -1122,7 +1643,11 @@ def get_leafs(self, tree, leafs): """Recursive function to extract all leafs in order out of tree. Each pass, jobs are moved from "tree" to "leafs" variables until done. """ - cur_leafs = [node for node in tree.nodes() if tree.in_degree(node)!=0 and tree.out_degree(node)==0] + cur_leafs = [ + node + for node in tree.nodes() + if tree.in_degree(node) != 0 and tree.out_degree(node) == 0 + ] leafs += cur_leafs for leaf in cur_leafs: @@ -1134,11 +1659,12 @@ def get_leafs(self, tree, leafs): def get_job_class(py_job): - name_import = py_job.replace('/','.').replace('.py','') + name_import = py_job.replace("/", ".").replace(".py", "") import_cmd = "from {} import Job".format(name_import) namespace = {} exec(import_cmd, namespace) - return namespace['Job'] + return namespace["Job"] + def send_email(message, receiver_email, sender_email, password, smtp_server, port): context = ssl.create_default_context() From 603423a61d174b0254bba7366bc0f0ec89b8ab32 Mon Sep 17 00:00:00 2001 From: AlejandroUPC Date: Tue, 31 May 2022 03:20:04 +0200 Subject: [PATCH 08/10] :sparkles: refactor and some tests on env_dispatchers --- tests/yaetos/test_env_dispatchers.py | 67 ++++++++++ yaetos/env_dispatchers.py | 179 ++++++++++++++++++--------- 2 files changed, 186 insertions(+), 60 deletions(-) create mode 100644 tests/yaetos/test_env_dispatchers.py diff --git a/tests/yaetos/test_env_dispatchers.py b/tests/yaetos/test_env_dispatchers.py new file mode 100644 index 00000000..df7f220c --- /dev/null +++ b/tests/yaetos/test_env_dispatchers.py @@ -0,0 +1,67 @@ +from yaetos.env_dispatchers import FSOpsDispatcher +import pytest +from typing import Tuple, List + + +@pytest.mark.parametrize( + "test_input,expected", + [ + ("s3://my-bucket", True), + ("s3a://my-bucket", True), + ], +) +def test_is_s3_path_valid_path(test_input: str, expected: bool) -> None: + assert FSOpsDispatcher().is_s3_path(test_input) == expected + + +@pytest.mark.parametrize( + "test_input,expected", + [ + ("s_3://", False), + ("s3//", False), + ], +) +def test_is_s3_path_invalid_path(test_input: str, expected: bool) -> None: + result = FSOpsDispatcher().is_s3_path(test_input) + assert result == expected + + +@pytest.mark.parametrize( + "test_input,expected", + [ + ( + "s3://my-bucket/a/b/e/f/v.txt", + ("my-bucket", "/a/b/e/f/v.txt", ["a", "b", "e", "f", "v.txt"]), + ), + ( + "s3a://my-cool_bucket-32/without_file/", + ("my-cool_bucket-32", "/without_file/", ["without_file"]), + ), + ], +) +def test_split_s3_path_working_paths( + 
test_input: str, expected: Tuple[str, str, List[str]] +) -> None: + + result = FSOpsDispatcher().split_s3_path(test_input) + assert result == expected + + +@pytest.mark.parametrize( + "test_input,expected", + [ + ( + "s3:/my-bucket/a/b/e/f/v.txt", + (None, None, None), + ), + ( + "s://my-cool_bucket-32/without_file/", + (None, None, None), + ), + ], +) +def test_split_s3_path_invalid_paths( + test_input: str, expected: Tuple[str, str, List[str]] +) -> None: + with pytest.raises(RuntimeError) as e: + result = FSOpsDispatcher().split_s3_path(test_input) diff --git a/yaetos/env_dispatchers.py b/yaetos/env_dispatchers.py index 18811076..924f52fd 100644 --- a/yaetos/env_dispatchers.py +++ b/yaetos/env_dispatchers.py @@ -4,53 +4,70 @@ import boto3 import os from io import StringIO -#from sklearn.externals import joblib # TODO: re-enable after fixing lib versions. +import re + +# from sklearn.externals import joblib # TODO: re-enable after fixing lib versions. from configparser import ConfigParser from yaetos.pandas_utils import load_df, save_pandas_local from yaetos.logger import setup_logging -logger = setup_logging('Job') +from typing import Tuple, List + +logger = setup_logging("Job") -class FS_Ops_Dispatcher(): +class FSOpsDispatcher: """Set of functions to dispatch mostly IO methods to local or cloud depending on the path being local or cloud (s3://*).""" @staticmethod - def is_s3_path(path): - return path.startswith('s3://') or path.startswith('s3a://') + def is_s3_path(path: str) -> bool: + """ + Returns own definition if a string is an s3 path or not. + """ + return bool(re.search(r"s3a?://", path)) @staticmethod - def split_s3_path(fname): - fname_parts = fname.split('s3://')[1].split('/') - bucket_name = fname_parts[0] - bucket_fname = '/'.join(fname_parts[1:]) - fname_parts = [item for item in fname_parts if item != ''] - return (bucket_name, bucket_fname, fname_parts) - + def split_s3_path(s3_path: str) -> Tuple[str, str, List[str]]: + """ + Splits a path to an s3 path in bucket name, bucket filename and filename parts. 
+ """ + bucket_pattern = r"s3a?://(?P(.+?)(?=/))(?P.*)" + extracted_groups = re.search(bucket_pattern, s3_path) + if not extracted_groups or len(extracted_groups.groups()) < 2: + raise RuntimeError( + f"Could not parse all the parts for the s3 path {s3_path}" + ) + bucket_name = extracted_groups.group("bucket_name") + file_key = extracted_groups.group("file_key") + return (bucket_name, file_key, [f for f in file_key.split("/") if f != ""]) # --- save_metadata set of functions ---- def save_metadata(self, fname, content): - self.save_metadata_cluster(fname, content) if self.is_s3_path(fname) else self.save_metadata_local(fname, content) + self.save_metadata_cluster(fname, content) if self.is_s3_path( + fname + ) else self.save_metadata_local(fname, content) @staticmethod def save_metadata_local(fname, content): - fh = open(fname, 'w') + fh = open(fname, "w") fh.write(content) fh.close() logger.info("Created file locally: {}".format(fname)) @staticmethod def save_metadata_cluster(fname, content): - fname_parts = fname.split('s3://')[1].split('/') + fname_parts = fname.split("s3://")[1].split("/") bucket_name = fname_parts[0] - bucket_fname = '/'.join(fname_parts[1:]) + bucket_fname = "/".join(fname_parts[1:]) fake_handle = StringIO(content) - s3c = boto3.Session(profile_name='default').client('s3') + s3c = boto3.Session(profile_name="default").client("s3") s3c.put_object(Bucket=bucket_name, Key=bucket_fname, Body=fake_handle.read()) logger.info("Created file S3: {}".format(fname)) # --- save_file set of functions ---- def save_file(self, fname, content): - self.save_file_cluster(fname, content) if self.is_s3_path(fname) else self.save_file_local(fname, content) + self.save_file_cluster(fname, content) if self.is_s3_path( + fname + ) else self.save_file_local(fname, content) @staticmethod def save_file_local(fname, content): @@ -61,20 +78,26 @@ def save_file_local(fname, content): logger.info("Saved content to new file locally: {}".format(fname)) def save_file_cluster(self, fname, content): - fname_parts = fname.split('s3://')[1].split('/') + fname_parts = fname.split("s3://")[1].split("/") bucket_name = fname_parts[0] - bucket_fname = '/'.join(fname_parts[1:]) - s3c = boto3.Session(profile_name='default').client('s3') + bucket_fname = "/".join(fname_parts[1:]) + s3c = boto3.Session(profile_name="default").client("s3") - local_path = CLUSTER_APP_FOLDER+'tmp/local_'+fname_parts[-1] + local_path = CLUSTER_APP_FOLDER + "tmp/local_" + fname_parts[-1] self.save_file_local(local_path, content) - fh = open(local_path, 'rb') + fh = open(local_path, "rb") s3c.put_object(Bucket=bucket_name, Key=bucket_fname, Body=fh) - logger.info("Pushed local file to S3, from '{}' to '{}' ".format(local_path, fname)) + logger.info( + "Pushed local file to S3, from '{}' to '{}' ".format(local_path, fname) + ) # --- load_file set of functions ---- def load_file(self, fname): - return self.load_file_cluster(fname) if self.is_s3_path(fname) else self.load_file_local(fname) + return ( + self.load_file_cluster(fname) + if self.is_s3_path(fname) + else self.load_file_local(fname) + ) @staticmethod def load_file_local(fname): @@ -82,11 +105,11 @@ def load_file_local(fname): @staticmethod def load_file_cluster(fname): - fname_parts = fname.split('s3://')[1].split('/') + fname_parts = fname.split("s3://")[1].split("/") bucket_name = fname_parts[0] - bucket_fname = '/'.join(fname_parts[1:]) - local_path = CLUSTER_APP_FOLDER+'tmp/s3_'+fname_parts[-1] - s3c = boto3.Session(profile_name='default').client('s3') + bucket_fname = 
"/".join(fname_parts[1:]) + local_path = CLUSTER_APP_FOLDER + "tmp/s3_" + fname_parts[-1] + s3c = boto3.Session(profile_name="default").client("s3") s3c.download_file(bucket_name, bucket_fname, local_path) logger.info("Copied file from S3 '{}' to local '{}'".format(fname, local_path)) model = joblib.load(local_path) @@ -94,33 +117,51 @@ def load_file_cluster(fname): # --- listdir set of functions ---- def listdir(self, path): - return self.listdir_cluster(path) if self.is_s3_path(path) else self.listdir_local(path) + return ( + self.listdir_cluster(path) + if self.is_s3_path(path) + else self.listdir_local(path) + ) @staticmethod def listdir_local(path): return os.listdir(path) @staticmethod - def listdir_cluster(path): # TODO: rename to listdir_s3, same for similar functions from FS_Ops_Dispatcher + def listdir_cluster( + path, + ): # TODO: rename to listdir_s3, same for similar functions from FSOpsDispatcher # TODO: better handle invalid path. Crashes with "TypeError: 'NoneType' object is not iterable" at last line. - if path.startswith('s3://'): - s3_root = 's3://' - elif path.startswith('s3a://'): - s3_root = 's3a://' # necessary when pulling S3 to local automatically from spark. + if path.startswith("s3://"): + s3_root = "s3://" + elif path.startswith("s3a://"): + s3_root = ( + "s3a://" # necessary when pulling S3 to local automatically from spark. + ) else: - raise ValueError('Problem with path. Pulling from s3, it should start with "s3://" or "s3a://". Path is: {}'.format(path)) - fname_parts = path.split(s3_root)[1].split('/') + raise ValueError( + 'Problem with path. Pulling from s3, it should start with "s3://" or "s3a://". Path is: {}'.format( + path + ) + ) + fname_parts = path.split(s3_root)[1].split("/") bucket_name = fname_parts[0] - prefix = '/'.join(fname_parts[1:]) - client = boto3.Session(profile_name='default').client('s3') - paginator = client.get_paginator('list_objects') - objects = paginator.paginate(Bucket=bucket_name, Prefix=prefix, Delimiter='/') - paths = [item['Prefix'].split('/')[-2] for item in objects.search('CommonPrefixes')] + prefix = "/".join(fname_parts[1:]) + client = boto3.Session(profile_name="default").client("s3") + paginator = client.get_paginator("list_objects") + objects = paginator.paginate(Bucket=bucket_name, Prefix=prefix, Delimiter="/") + paths = [ + item["Prefix"].split("/")[-2] for item in objects.search("CommonPrefixes") + ] return paths # --- dir_exist set of functions ---- def dir_exist(self, path): - return self.dir_exist_cluster(path) if self.is_s3_path(path) else self.dir_exist_local(path) + return ( + self.dir_exist_cluster(path) + if self.is_s3_path(path) + else self.dir_exist_local(path) + ) @staticmethod def dir_exist_local(path): @@ -130,10 +171,13 @@ def dir_exist_local(path): def dir_exist_cluster(path): raise NotImplementedError - # --- load_pandas set of functions ---- def load_pandas(self, fname, file_type, read_func, read_kwargs): - return self.load_pandas_cluster(fname, file_type, read_func, read_kwargs) if self.is_s3_path(fname) else self.load_pandas_local(fname, file_type, read_func, read_kwargs) + return ( + self.load_pandas_cluster(fname, file_type, read_func, read_kwargs) + if self.is_s3_path(fname) + else self.load_pandas_local(fname, file_type, read_func, read_kwargs) + ) @staticmethod def load_pandas_local(fname, file_type, read_func, read_kwargs): @@ -144,19 +188,28 @@ def load_pandas_cluster(self, fname, file_type, read_func, read_kwargs): from cloudpathlib import CloudPath bucket_name, bucket_fname, fname_parts 
= self.split_s3_path(fname) - local_path = 'tmp/s3_copy_'+fname_parts[-1] - cp = CloudPath(fname) # TODO: add way to load it with specific profile_name or client, as in "s3c = boto3.Session(profile_name='default').client('s3')" - logger.info("Copying files from S3 '{}' to local '{}'. May take some time.".format(fname, local_path)) + local_path = "tmp/s3_copy_" + fname_parts[-1] + cp = CloudPath( + fname + ) # TODO: add way to load it with specific profile_name or client, as in "s3c = boto3.Session(profile_name='default').client('s3')" + logger.info( + "Copying files from S3 '{}' to local '{}'. May take some time.".format( + fname, local_path + ) + ) local_pathlib = cp.download_to(local_path) - local_path = local_path + '/' if local_pathlib.is_dir() else local_path + local_path = local_path + "/" if local_pathlib.is_dir() else local_path logger.info("File copy finished") df = load_df(local_path, file_type, read_func, read_kwargs) return df - # --- save_pandas set of functions ---- def save_pandas(self, df, fname, save_method, save_kwargs): - return self.save_pandas_cluster(df, fname, save_method, save_kwargs) if self.is_s3_path(fname) else self.save_pandas_local(df, fname, save_method, save_kwargs) + return ( + self.save_pandas_cluster(df, fname, save_method, save_kwargs) + if self.is_s3_path(fname) + else self.save_pandas_local(df, fname, save_method, save_kwargs) + ) @staticmethod def save_pandas_local(df, fname, save_method, save_kwargs): @@ -167,26 +220,32 @@ def save_pandas_cluster(self, df, fname, save_method, save_kwargs): bucket_name, bucket_fname, fname_parts = self.split_s3_path(fname) with StringIO() as file_buffer: save_pandas_local(df, file_buffer, save_method, save_kwargs) - s3c = boto3.Session(profile_name='default').client('s3') - response = s3c.put_object(Bucket=bucket_name, Key=bucket_fname, Body=file_buffer.getvalue()) + s3c = boto3.Session(profile_name="default").client("s3") + response = s3c.put_object( + Bucket=bucket_name, Key=bucket_fname, Body=file_buffer.getvalue() + ) logger.info("Created file in S3: {}".format(fname)) return df -class Cred_Ops_Dispatcher(): - def retrieve_secrets(self, storage, creds='conf/connections.cfg'): - creds = self.retrieve_secrets_cluster() if storage=='s3' else self.retrieve_secrets_local(creds) +class Cred_Ops_Dispatcher: + def retrieve_secrets(self, storage, creds="conf/connections.cfg"): + creds = ( + self.retrieve_secrets_cluster() + if storage == "s3" + else self.retrieve_secrets_local(creds) + ) return creds @staticmethod def retrieve_secrets_cluster(): - client = boto3.Session(profile_name='default').client('secretsmanager') + client = boto3.Session(profile_name="default").client("secretsmanager") response = client.get_secret_value(SecretId=AWS_SECRET_ID) - logger.info('Read aws secret, secret_id:'+AWS_SECRET_ID) - logger.debug('get_secret_value response: '+str(response)) - content = response['SecretString'] + logger.info("Read aws secret, secret_id:" + AWS_SECRET_ID) + logger.debug("get_secret_value response: " + str(response)) + content = response["SecretString"] fake_handle = StringIO(content) config = ConfigParser() From b996fe5b4f7231f9c92424718ca802de25db85e9 Mon Sep 17 00:00:00 2001 From: AlejandroUPC Date: Tue, 31 May 2022 03:20:40 +0200 Subject: [PATCH 09/10] :wrench: big refactor, class following pep8, context managers,etc.. 
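
This commit renames the framework base class to PEP 8 CapWords style (ETL_Base becomes ETLBase), reformats strings and call sites consistently, and keeps the context-manager file handling introduced earlier, so every example job below only swaps its import and base class. A minimal illustrative sketch of the resulting job shape (hypothetical, not one of the files changed below), assuming the renamed ETLBase, the existing Commandliner entry point, and the self.query() helper shown in the earlier patches:

    from yaetos.etl_utils import ETLBase, Commandliner
    from pyspark import sql

    class Job(ETLBase):
        def transform(self, some_events) -> sql.DataFrame:
            # Same query pattern as the example jobs below; only the base class name changes.
            return self.query(
                f"SELECT se.session_id, count(*) as n FROM {some_events} se GROUP BY se.session_id"
            )

    if __name__ == "__main__":
        args = {"job_param_file": "conf/jobs_metadata.yml"}
        Commandliner(Job, **args)
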
--- jobs/examples/ex0_extraction_job.py | 4 +- jobs/examples/ex10_troubleshoot_job.py | 16 +- jobs/examples/ex1_frameworked_job.py | 4 +- jobs/examples/ex2_frameworked_job.py | 4 +- jobs/examples/ex3_incremental_job.py | 5 +- jobs/examples/ex3_incremental_prep_job.py | 4 +- jobs/examples/ex4_dependency1_job.py | 4 +- jobs/examples/ex4_dependency2_job.py | 12 +- jobs/examples/ex4_dependency4_job.py | 12 +- jobs/examples/ex5_copy_to_oracle_job.py | 18 +- jobs/examples/ex5_copy_to_redshift_job.py | 18 +- jobs/examples/ex5_input_from_oracle_job.py | 12 +- jobs/examples/ex7_extraction_small_job.py | 12 +- jobs/examples/ex7_hybrid_pandas_spark_job.py | 36 +- jobs/examples/ex7_pandas_job.py | 34 +- jobs/examples/ex8_koalas_job.py | 17 +- jobs/examples/ex9_clickhouse_job.py | 38 +- jobs/examples/ex9_mysql_job.py | 35 +- jobs/examples/ex9_redshift_job.py | 66 +- jobs/examples/wordcount_frameworked_job.py | 12 +- jobs/generic/copy_job.py | 7 +- jobs/generic/dummy_job.py | 7 +- .../jobs/examples/ex1_frameworked_job_test.py | 49 +- tests/jobs/examples/ex1_full_sql_job_test.py | 51 +- .../jobs/examples/ex4_dependency1_job_test.py | 37 +- .../jobs/examples/ex4_dependency2_job_test.py | 37 +- .../ex7_hybrid_pandas_spark_job_test.py | 47 +- tests/jobs/examples/ex7_pandas_job_test.py | 42 +- tests/yaetos/etl_utils_test.py | 266 +++-- yaetos/daily_incremental_job.py | 36 +- yaetos/deploy.py | 935 ++++++++++++------ yaetos/etl_utils.py | 20 +- yaetos/git_utils.py | 80 +- yaetos/kafka_utils.py | 2 +- yaetos/libs/generic_jobs/copy_job.py | 7 +- yaetos/libs/generic_jobs/dummy_job.py | 7 +- yaetos/logger.py | 8 +- yaetos/mysql_job.py | 19 +- yaetos/oracle_sql_job.py | 4 +- yaetos/scripts/copy/ex0_extraction_job.py | 22 +- yaetos/scripts/copy/ex1_frameworked_job.py | 12 +- .../scripts/copy/ex1_frameworked_job_test.py | 47 +- yaetos/scripts/copy/ex1_full_sql_job_test.py | 51 +- yaetos/scripts/install_env.py | 127 ++- yaetos/sql_job.py | 22 +- 45 files changed, 1472 insertions(+), 833 deletions(-) diff --git a/jobs/examples/ex0_extraction_job.py b/jobs/examples/ex0_extraction_job.py index 171680d2..ff5176f5 100644 --- a/jobs/examples/ex0_extraction_job.py +++ b/jobs/examples/ex0_extraction_job.py @@ -1,12 +1,12 @@ """ Demo basic extraction job using a public datasource (from wikimedia) """ -from yaetos.etl_utils import ETL_Base, Commandliner +from yaetos.etl_utils import ETLBase, Commandliner import requests import os import pandas as pd from pyspark import sql -class Job(ETL_Base): +class Job(ETLBase): def transform(self) -> sql.DataFrame: url = self.jargs.api_inputs["path"] resp = requests.get(url, allow_redirects=True) diff --git a/jobs/examples/ex10_troubleshoot_job.py b/jobs/examples/ex10_troubleshoot_job.py index 6fdb1633..093b52e9 100644 --- a/jobs/examples/ex10_troubleshoot_job.py +++ b/jobs/examples/ex10_troubleshoot_job.py @@ -1,23 +1,27 @@ """To show troubleshooting, done through 'import ipdb; ipdb.set_trace()'.""" -from yaetos.etl_utils import ETL_Base, Commandliner +from yaetos.etl_utils import ETLBase, Commandliner -class Job(ETL_Base): +class Job(ETLBase): def transform(self, some_events, other_events): - df = self.query(""" + df = self.query( + """ SELECT se.session_id, count(*) as count_events FROM some_events se JOIN other_events oe on se.session_id=oe.session_id WHERE se.action='searchResultPage' and se.n_results>0 group by se.session_id order by count(*) desc - """) + """ + ) - import ipdb; ipdb.set_trace() # will drop to python terminal here to inspect + import ipdb + + ipdb.set_trace() # will 
drop to python terminal here to inspect return df if __name__ == "__main__": - args = {'job_param_file': 'conf/jobs_metadata.yml'} + args = {"job_param_file": "conf/jobs_metadata.yml"} Commandliner(Job, **args) diff --git a/jobs/examples/ex1_frameworked_job.py b/jobs/examples/ex1_frameworked_job.py index 0028f6d5..d7a67292 100644 --- a/jobs/examples/ex1_frameworked_job.py +++ b/jobs/examples/ex1_frameworked_job.py @@ -1,9 +1,9 @@ """Same as ex1_full_sql_job.sql but allows access to spark for more complex ops (not used here but in ex2_frameworked_job.py).""" -from yaetos.etl_utils import ETL_Base, Commandliner +from yaetos.etl_utils import ETLBase, Commandliner from pyspark import sql -class Job(ETL_Base): +class Job(ETLBase): def transform( self, some_events="some_events", other_events="other_events" ) -> sql.DataFrame: diff --git a/jobs/examples/ex2_frameworked_job.py b/jobs/examples/ex2_frameworked_job.py index b5570d3f..6bce9a5f 100644 --- a/jobs/examples/ex2_frameworked_job.py +++ b/jobs/examples/ex2_frameworked_job.py @@ -1,4 +1,4 @@ -from yaetos.etl_utils import ETL_Base, Commandliner +from yaetos.etl_utils import ETLBase, Commandliner from pyspark.sql.functions import udf, array from pyspark.sql.types import StringType, IntegerType from pyspark.sql.functions import col @@ -6,7 +6,7 @@ from datetime import datetime -class Job(ETL_Base): +class Job(ETLBase): def transform(self, some_events, other_events) -> sql.DataFrame: """For demo only. Functional but no specific business logic.""" diff --git a/jobs/examples/ex3_incremental_job.py b/jobs/examples/ex3_incremental_job.py index faffae68..73a02277 100644 --- a/jobs/examples/ex3_incremental_job.py +++ b/jobs/examples/ex3_incremental_job.py @@ -1,7 +1,8 @@ -from yaetos.etl_utils import ETL_Base, Commandliner +from yaetos.etl_utils import ETLBase, Commandliner from pyspark import sql -class Job(ETL_Base): + +class Job(ETLBase): def transform(self, processed_events="processed_events") -> sql.DataFrame: df = self.query( f""" diff --git a/jobs/examples/ex3_incremental_prep_job.py b/jobs/examples/ex3_incremental_prep_job.py index 7bdb3696..45bd9299 100644 --- a/jobs/examples/ex3_incremental_prep_job.py +++ b/jobs/examples/ex3_incremental_prep_job.py @@ -1,10 +1,10 @@ -from yaetos.etl_utils import ETL_Base, Commandliner +from yaetos.etl_utils import ETLBase, Commandliner from pyspark.sql.functions import udf, array from pyspark.sql.types import StringType, IntegerType from pyspark.sql.functions import col -class Job(ETL_Base): +class Job(ETLBase): def transform(self, some_events): udf_format_datetime = udf(self.format_datetime, StringType()) diff --git a/jobs/examples/ex4_dependency1_job.py b/jobs/examples/ex4_dependency1_job.py index 637991da..be5df2b3 100644 --- a/jobs/examples/ex4_dependency1_job.py +++ b/jobs/examples/ex4_dependency1_job.py @@ -1,8 +1,8 @@ -from yaetos.etl_utils import ETL_Base, Commandliner +from yaetos.etl_utils import ETLBase, Commandliner from pyspark import sql -class Job(ETL_Base): +class Job(ETLBase): def transform(self, some_events: str = "some_events") -> sql.DataFrame: return self.query( f""" diff --git a/jobs/examples/ex4_dependency2_job.py b/jobs/examples/ex4_dependency2_job.py index f3ecf736..a8bed6d5 100644 --- a/jobs/examples/ex4_dependency2_job.py +++ b/jobs/examples/ex4_dependency2_job.py @@ -1,18 +1,20 @@ -from yaetos.etl_utils import ETL_Base, Commandliner +from yaetos.etl_utils import ETLBase, Commandliner from pyspark.sql.functions import udf, array from pyspark.sql.types import StringType, IntegerType 
from pyspark.sql.functions import col -class Job(ETL_Base): +class Job(ETLBase): def transform(self, some_events): - df = self.query(""" + df = self.query( + """ SELECT se.session_id, session_length, session_length*2 as doubled_length FROM some_events se - """) + """ + ) return df if __name__ == "__main__": - args = {'job_param_file': 'conf/jobs_metadata.yml'} + args = {"job_param_file": "conf/jobs_metadata.yml"} Commandliner(Job, **args) diff --git a/jobs/examples/ex4_dependency4_job.py b/jobs/examples/ex4_dependency4_job.py index 201be541..39067f02 100644 --- a/jobs/examples/ex4_dependency4_job.py +++ b/jobs/examples/ex4_dependency4_job.py @@ -1,18 +1,20 @@ -from yaetos.etl_utils import ETL_Base, Commandliner +from yaetos.etl_utils import ETLBase, Commandliner from pyspark.sql.functions import udf, array from pyspark.sql.types import StringType, IntegerType from pyspark.sql.functions import col -class Job(ETL_Base): +class Job(ETLBase): def transform(self, some_events): - df = self.query(""" + df = self.query( + """ SELECT * , session_length*8 as D FROM some_events se - """) + """ + ) return df if __name__ == "__main__": - args = {'job_param_file': 'conf/jobs_metadata.yml'} + args = {"job_param_file": "conf/jobs_metadata.yml"} Commandliner(Job, **args) diff --git a/jobs/examples/ex5_copy_to_oracle_job.py b/jobs/examples/ex5_copy_to_oracle_job.py index d4da6d28..73f55df0 100644 --- a/jobs/examples/ex5_copy_to_oracle_job.py +++ b/jobs/examples/ex5_copy_to_oracle_job.py @@ -1,25 +1,27 @@ -from yaetos.etl_utils import ETL_Base, Commandliner +from yaetos.etl_utils import ETLBase, Commandliner from sqlalchemy import types -class Job(ETL_Base): +class Job(ETLBase): OUTPUT_TYPES = { - 'session_id': types.VARCHAR(16), - 'count_events': types.INT(), - } + "session_id": types.VARCHAR(16), + "count_events": types.INT(), + } def transform(self, some_events, other_events): - df = self.query(""" + df = self.query( + """ SELECT se.session_id, count(*) as count_events FROM some_events se JOIN other_events oe on se.session_id=oe.session_id WHERE se.action='searchResultPage' and se.n_results>0 group by se.session_id order by count(*) desc - """) + """ + ) return df if __name__ == "__main__": - args = {'job_param_file': 'conf/jobs_metadata.yml'} + args = {"job_param_file": "conf/jobs_metadata.yml"} Commandliner(Job, **args) diff --git a/jobs/examples/ex5_copy_to_redshift_job.py b/jobs/examples/ex5_copy_to_redshift_job.py index d4da6d28..73f55df0 100644 --- a/jobs/examples/ex5_copy_to_redshift_job.py +++ b/jobs/examples/ex5_copy_to_redshift_job.py @@ -1,25 +1,27 @@ -from yaetos.etl_utils import ETL_Base, Commandliner +from yaetos.etl_utils import ETLBase, Commandliner from sqlalchemy import types -class Job(ETL_Base): +class Job(ETLBase): OUTPUT_TYPES = { - 'session_id': types.VARCHAR(16), - 'count_events': types.INT(), - } + "session_id": types.VARCHAR(16), + "count_events": types.INT(), + } def transform(self, some_events, other_events): - df = self.query(""" + df = self.query( + """ SELECT se.session_id, count(*) as count_events FROM some_events se JOIN other_events oe on se.session_id=oe.session_id WHERE se.action='searchResultPage' and se.n_results>0 group by se.session_id order by count(*) desc - """) + """ + ) return df if __name__ == "__main__": - args = {'job_param_file': 'conf/jobs_metadata.yml'} + args = {"job_param_file": "conf/jobs_metadata.yml"} Commandliner(Job, **args) diff --git a/jobs/examples/ex5_input_from_oracle_job.py b/jobs/examples/ex5_input_from_oracle_job.py index e37490b9..cb6733f8 
100644 --- a/jobs/examples/ex5_input_from_oracle_job.py +++ b/jobs/examples/ex5_input_from_oracle_job.py @@ -1,15 +1,15 @@ -from yaetos.etl_utils import ETL_Base, Commandliner, Cred_Ops_Dispatcher +from yaetos.etl_utils import ETLBase, Commandliner, Cred_Ops_Dispatcher from yaetos.db_utils import pdf_to_sdf from libs.python_db_connectors.query_oracle import query as query_oracle from sqlalchemy import types -class Job(ETL_Base): +class Job(ETLBase): OUTPUT_TYPES = { - 'session_id': types.VARCHAR(16), - 'count_events': types.INT(), - } + "session_id": types.VARCHAR(16), + "count_events": types.INT(), + } def transform(self): cred_profiles = Cred_Ops_Dispatcher().retrieve_secrets(self.jargs.storage) @@ -25,5 +25,5 @@ def transform(self): if __name__ == "__main__": - args = {'job_param_file': 'conf/jobs_metadata.yml'} + args = {"job_param_file": "conf/jobs_metadata.yml"} Commandliner(Job, **args) diff --git a/jobs/examples/ex7_extraction_small_job.py b/jobs/examples/ex7_extraction_small_job.py index 086e29ee..bb1ec717 100644 --- a/jobs/examples/ex7_extraction_small_job.py +++ b/jobs/examples/ex7_extraction_small_job.py @@ -1,17 +1,19 @@ """Job to get small version of wiki sample data, to speed up running downstream jobs, for testing purposes.""" -from yaetos.etl_utils import ETL_Base, Commandliner +from yaetos.etl_utils import ETLBase, Commandliner # TODO: move it to .sql job. -class Job(ETL_Base): +class Job(ETLBase): def transform(self, events): - df = self.query(""" + df = self.query( + """ SELECT * FROM events LIMIT 1000 - """) + """ + ) return df if __name__ == "__main__": - args = {'job_param_file': 'conf/jobs_metadata.yml'} + args = {"job_param_file": "conf/jobs_metadata.yml"} Commandliner(Job, **args) diff --git a/jobs/examples/ex7_hybrid_pandas_spark_job.py b/jobs/examples/ex7_hybrid_pandas_spark_job.py index c3741611..743807d1 100644 --- a/jobs/examples/ex7_hybrid_pandas_spark_job.py +++ b/jobs/examples/ex7_hybrid_pandas_spark_job.py @@ -1,15 +1,13 @@ """Same transformation as ex1_full_sql_job.sql, done in pandas""" -from yaetos.etl_utils import ETL_Base, Commandliner +from yaetos.etl_utils import ETLBase, Commandliner from yaetos.db_utils import pdf_to_sdf from sqlalchemy import types import pandas as pd import numpy as np -class Job(ETL_Base): - OUTPUT_TYPES = { - 'session_id': types.VARCHAR(100), - 'count_events': types.INT()} +class Job(ETLBase): + OUTPUT_TYPES = {"session_id": types.VARCHAR(100), "count_events": types.INT()} def transform(self, some_events, other_events): # Conversions of inputs to pandas @@ -18,13 +16,27 @@ def transform(self, some_events, other_events): # Transformation in pandas some_events_pd = some_events_pd[:1000] - df1 = some_events_pd[some_events_pd.apply(lambda row: row['action'] == 'searchResultPage' and float(row['n_results'])>0, axis=1)] - df2 = pd.merge(left=df1, right=other_events_pd, how='inner', left_on='session_id', right_on='session_id', indicator = True, suffixes=('_1', '_2')) - df3 = df2.groupby(by=['session_id']).agg({'_merge': np.count_nonzero}) - df3.rename(columns={'_merge':'count_events'}, inplace=True) - df3.sort_values('count_events', ascending=False, inplace=True) + df1 = some_events_pd[ + some_events_pd.apply( + lambda row: row["action"] == "searchResultPage" + and float(row["n_results"]) > 0, + axis=1, + ) + ] + df2 = pd.merge( + left=df1, + right=other_events_pd, + how="inner", + left_on="session_id", + right_on="session_id", + indicator=True, + suffixes=("_1", "_2"), + ) + df3 = df2.groupby(by=["session_id"]).agg({"_merge": 
np.count_nonzero}) + df3.rename(columns={"_merge": "count_events"}, inplace=True) + df3.sort_values("count_events", ascending=False, inplace=True) df3.reset_index(drop=False, inplace=True) - self.logger.info('Post filter length: {}'.format(len(df1))) + self.logger.info("Post filter length: {}".format(len(df1))) # Conversion of output to Spark sdf = pdf_to_sdf(df3, self.OUTPUT_TYPES, self.sc, self.sc_sql) @@ -32,5 +44,5 @@ def transform(self, some_events, other_events): if __name__ == "__main__": - args = {'job_param_file': 'conf/jobs_metadata.yml'} + args = {"job_param_file": "conf/jobs_metadata.yml"} Commandliner(Job, **args) diff --git a/jobs/examples/ex7_pandas_job.py b/jobs/examples/ex7_pandas_job.py index 1773894f..86b7fa08 100644 --- a/jobs/examples/ex7_pandas_job.py +++ b/jobs/examples/ex7_pandas_job.py @@ -3,24 +3,38 @@ This allows for faster run for small datasets but looses some of the benefits of spark dataframes (support for SQL, better field type management, etc.). Transformation is the same as ex1_full_sql_job.sql """ -from yaetos.etl_utils import ETL_Base, Commandliner +from yaetos.etl_utils import ETLBase, Commandliner import pandas as pd import numpy as np -class Job(ETL_Base): +class Job(ETLBase): def transform(self, some_events, other_events): - some_events = some_events[:1000] # to speed up since it is an example job - df1 = some_events[some_events.apply(lambda row: row['action'] == 'searchResultPage' and float(row['n_results'])>0, axis=1)] - df2 = pd.merge(left=df1, right=other_events, how='inner', left_on='session_id', right_on='session_id', indicator = True, suffixes=('_1', '_2')) - df3 = df2.groupby(by=['session_id']).agg({'_merge': np.count_nonzero}) - df3.rename(columns={'_merge':'count_events'}, inplace=True) - df3.sort_values('count_events', ascending=False, inplace=True) + some_events = some_events[:1000] # to speed up since it is an example job + df1 = some_events[ + some_events.apply( + lambda row: row["action"] == "searchResultPage" + and float(row["n_results"]) > 0, + axis=1, + ) + ] + df2 = pd.merge( + left=df1, + right=other_events, + how="inner", + left_on="session_id", + right_on="session_id", + indicator=True, + suffixes=("_1", "_2"), + ) + df3 = df2.groupby(by=["session_id"]).agg({"_merge": np.count_nonzero}) + df3.rename(columns={"_merge": "count_events"}, inplace=True) + df3.sort_values("count_events", ascending=False, inplace=True) df3.reset_index(drop=False, inplace=True) - self.logger.info('Post filter length: {}'.format(len(df1))) + self.logger.info("Post filter length: {}".format(len(df1))) return df3 if __name__ == "__main__": - args = {'job_param_file': 'conf/jobs_metadata.yml'} + args = {"job_param_file": "conf/jobs_metadata.yml"} Commandliner(Job, **args) diff --git a/jobs/examples/ex8_koalas_job.py b/jobs/examples/ex8_koalas_job.py index 5c8eb2a1..d3323a49 100644 --- a/jobs/examples/ex8_koalas_job.py +++ b/jobs/examples/ex8_koalas_job.py @@ -1,20 +1,23 @@ """To show koalas API. 
Same process as SQL code in ex2_frameworked_job.py""" -from yaetos.etl_utils import ETL_Base, Commandliner +from yaetos.etl_utils import ETLBase, Commandliner import pandas as pd import numpy as np import databricks.koalas as ks -class Job(ETL_Base): + +class Job(ETLBase): def transform(self, some_events, other_events): # Convert spark df to koalas df se_kdf = some_events.to_koalas() oe_kdf = other_events.to_koalas() # processing - se_kdf = se_kdf[se_kdf['action']=='searchResultPage'] - se_kdf = se_kdf[se_kdf['n_results']>0] - merged_kdf = ks.merge(se_kdf, oe_kdf, on='session_id', how='inner', suffixes=('_l','_r')) - grouped = merged_kdf.groupby(by=['session_id']).count() + se_kdf = se_kdf[se_kdf["action"] == "searchResultPage"] + se_kdf = se_kdf[se_kdf["n_results"] > 0] + merged_kdf = ks.merge( + se_kdf, oe_kdf, on="session_id", how="inner", suffixes=("_l", "_r") + ) + grouped = merged_kdf.groupby(by=["session_id"]).count() # back to spark df sdf = grouped.to_spark() @@ -22,5 +25,5 @@ def transform(self, some_events, other_events): if __name__ == "__main__": - args = {'job_param_file': 'conf/jobs_metadata.yml'} + args = {"job_param_file": "conf/jobs_metadata.yml"} Commandliner(Job, **args) diff --git a/jobs/examples/ex9_clickhouse_job.py b/jobs/examples/ex9_clickhouse_job.py index ebd91777..eb2a3dcd 100644 --- a/jobs/examples/ex9_clickhouse_job.py +++ b/jobs/examples/ex9_clickhouse_job.py @@ -2,34 +2,40 @@ Typically not needed since data is read/written to clickhouse from framework, as defined in job_metadata.yml. May require VPN to access clickhouse. """ -from yaetos.etl_utils import ETL_Base, Commandliner, Cred_Ops_Dispatcher +from yaetos.etl_utils import ETLBase, Commandliner, Cred_Ops_Dispatcher -class Job(ETL_Base): +class Job(ETLBase): def transform(self): - creds = Cred_Ops_Dispatcher().retrieve_secrets(self.jargs.storage, creds=self.jargs.connection_file) - creds_section = self.jargs.yml_args['db_inputs']['creds'] + creds = Cred_Ops_Dispatcher().retrieve_secrets( + self.jargs.storage, creds=self.jargs.connection_file + ) + creds_section = self.jargs.yml_args["db_inputs"]["creds"] db = creds[creds_section] - url = 'jdbc:postgresql://{host}/{service}'.format(host=db['host'], service=db['service']) - dbtable = 'some.table' + url = "jdbc:postgresql://{host}/{service}".format( + host=db["host"], service=db["service"] + ) + dbtable = "some.table" # Reading from clickhouse self.logger.info('Pulling table "{}" from clickhouse'.format(dbtable)) - df = self.sc_sql.read \ - .format('jdbc') \ - .option('driver', "org.postgresql.Driver") \ - .option("url", url) \ - .option("user", db['user']) \ - .option("password", db['password']) \ - .option("dbtable", dbtable)\ + df = ( + self.sc_sql.read.format("jdbc") + .option("driver", "org.postgresql.Driver") + .option("url", url) + .option("user", db["user"]) + .option("password", db["password"]) + .option("dbtable", dbtable) .load() + ) count = df.count() - self.logger.info('Done pulling table, row count:{}'.format(count)) + self.logger.info("Done pulling table, row count:{}".format(count)) return df if __name__ == "__main__": - args = {'job_param_file': 'conf/jobs_metadata.yml', - 'load_connectors': 'all', + args = { + "job_param_file": "conf/jobs_metadata.yml", + "load_connectors": "all", } Commandliner(Job, **args) diff --git a/jobs/examples/ex9_mysql_job.py b/jobs/examples/ex9_mysql_job.py index 424d7f2e..47593a64 100644 --- a/jobs/examples/ex9_mysql_job.py +++ b/jobs/examples/ex9_mysql_job.py @@ -2,35 +2,40 @@ Typically not needed since data is 
read/written to mysql from framework, as defined in job_metadata.yml. May require VPN to access mysql. """ -from yaetos.etl_utils import ETL_Base, Commandliner, Cred_Ops_Dispatcher +from yaetos.etl_utils import ETLBase, Commandliner, Cred_Ops_Dispatcher -class Job(ETL_Base): +class Job(ETLBase): def transform(self): - creds = Cred_Ops_Dispatcher().retrieve_secrets(self.jargs.storage, creds=self.jargs.connection_file) - creds_section = self.jargs.yml_args['db_inputs']['creds'] + creds = Cred_Ops_Dispatcher().retrieve_secrets( + self.jargs.storage, creds=self.jargs.connection_file + ) + creds_section = self.jargs.yml_args["db_inputs"]["creds"] db = creds[creds_section] - url = 'jdbc:mysql://{host}:{port}/{service}'.format(host=db['host'], port=db['port'], service=db['service']) - dbtable = 'some.table' + url = "jdbc:mysql://{host}:{port}/{service}".format( + host=db["host"], port=db["port"], service=db["service"] + ) + dbtable = "some.table" # # Writing to mysql # TODO: add example # Reading from mysql self.logger.info('Pulling table "{}" from mysql'.format(dbtable)) - df = self.sc_sql.read \ - .format('jdbc') \ - .option('driver', "com.mysql.jdbc.Driver") \ - .option("url", url) \ - .option("user", db['user']) \ - .option("password", db['password']) \ - .option("dbtable", dbtable)\ + df = ( + self.sc_sql.read.format("jdbc") + .option("driver", "com.mysql.jdbc.Driver") + .option("url", url) + .option("user", db["user"]) + .option("password", db["password"]) + .option("dbtable", dbtable) .load() + ) count = df.count() - self.logger.info('Done pulling table, row count:{}'.format(count)) + self.logger.info("Done pulling table, row count:{}".format(count)) return df if __name__ == "__main__": - args = {'job_param_file': 'conf/jobs_metadata.yml'} + args = {"job_param_file": "conf/jobs_metadata.yml"} Commandliner(Job, **args) diff --git a/jobs/examples/ex9_redshift_job.py b/jobs/examples/ex9_redshift_job.py index 94cb42e6..ac41fcc2 100644 --- a/jobs/examples/ex9_redshift_job.py +++ b/jobs/examples/ex9_redshift_job.py @@ -2,47 +2,61 @@ Typically not needed since data is read/written to redshift from framework, as defined in job_metadata.yml. May require VPN to access redshift. 
""" -from yaetos.etl_utils import ETL_Base, Commandliner, Cred_Ops_Dispatcher, REDSHIFT_S3_TMP_DIR +from yaetos.etl_utils import ( + ETLBase, + Commandliner, + Cred_Ops_Dispatcher, + REDSHIFT_S3_TMP_DIR, +) -class Job(ETL_Base): +class Job(ETLBase): def transform(self, some_events): - creds = Cred_Ops_Dispatcher().retrieve_secrets(self.jargs.storage, creds=self.jargs.connection_file) - creds_section = self.jargs.yml_args['copy_to_redshift']['creds'] + creds = Cred_Ops_Dispatcher().retrieve_secrets( + self.jargs.storage, creds=self.jargs.connection_file + ) + creds_section = self.jargs.yml_args["copy_to_redshift"]["creds"] db = creds[creds_section] - url = 'jdbc:redshift://{host}:{port}/{service}'.format(host=db['host'], port=db['port'], service=db['service']) - dbtable = 'sandbox.test_ex9_redshift' + url = "jdbc:redshift://{host}:{port}/{service}".format( + host=db["host"], port=db["port"], service=db["service"] + ) + dbtable = "sandbox.test_ex9_redshift" # Writing to redshift - self.logger.info('Sending table "{}" to redshift, size "{}".'.format(dbtable, some_events.count())) - some_events.write \ - .format('com.databricks.spark.redshift') \ - .option("tempdir", REDSHIFT_S3_TMP_DIR) \ - .option("url", url) \ - .option("user", db['user']) \ - .option("password", db['password']) \ - .option("dbtable", dbtable) \ - .mode('overwrite') \ - .save() - self.logger.info('Done sending table') + self.logger.info( + 'Sending table "{}" to redshift, size "{}".'.format( + dbtable, some_events.count() + ) + ) + some_events.write.format("com.databricks.spark.redshift").option( + "tempdir", REDSHIFT_S3_TMP_DIR + ).option("url", url).option("user", db["user"]).option( + "password", db["password"] + ).option( + "dbtable", dbtable + ).mode( + "overwrite" + ).save() + self.logger.info("Done sending table") # Reading from redshift self.logger.info('Pulling table "{}" from redshift'.format(dbtable)) - df = self.sc_sql.read \ - .format('com.databricks.spark.redshift') \ - .option("tempdir", REDSHIFT_S3_TMP_DIR) \ - .option("url", url) \ - .option("user", db['user']) \ - .option("password", db['password']) \ - .option("dbtable", dbtable)\ + df = ( + self.sc_sql.read.format("com.databricks.spark.redshift") + .option("tempdir", REDSHIFT_S3_TMP_DIR) + .option("url", url) + .option("user", db["user"]) + .option("password", db["password"]) + .option("dbtable", dbtable) .load() + ) count = df.count() - self.logger.info('Done pulling table, row count:{}'.format(count)) + self.logger.info("Done pulling table, row count:{}".format(count)) # Output table will also be sent to redshift as required by job_metadata.yml return some_events if __name__ == "__main__": - args = {'job_param_file': 'conf/jobs_metadata.yml'} + args = {"job_param_file": "conf/jobs_metadata.yml"} Commandliner(Job, **args) diff --git a/jobs/examples/wordcount_frameworked_job.py b/jobs/examples/wordcount_frameworked_job.py index 45730d15..a087cd17 100644 --- a/jobs/examples/wordcount_frameworked_job.py +++ b/jobs/examples/wordcount_frameworked_job.py @@ -1,15 +1,15 @@ -from yaetos.etl_utils import ETL_Base, Commandliner +from yaetos.etl_utils import ETLBase, Commandliner from operator import add -class Job(ETL_Base): +class Job(ETLBase): def transform(self, lines): - counts = lines.flatMap(lambda x: x.split(' ')) \ - .map(lambda x: (x, 1)) \ - .reduceByKey(add) + counts = ( + lines.flatMap(lambda x: x.split(" ")).map(lambda x: (x, 1)).reduceByKey(add) + ) return counts if __name__ == "__main__": - args = {'job_param_file': 'conf/jobs_metadata.yml'} + args 
= {"job_param_file": "conf/jobs_metadata.yml"} Commandliner(Job, **args) diff --git a/jobs/generic/copy_job.py b/jobs/generic/copy_job.py index d9f1dda9..1285a155 100644 --- a/jobs/generic/copy_job.py +++ b/jobs/generic/copy_job.py @@ -1,8 +1,7 @@ -from yaetos.etl_utils import ETL_Base, Commandliner +from yaetos.etl_utils import ETLBase, Commandliner -class Job(ETL_Base): - +class Job(ETLBase): def transform(self, table_to_copy): table_to_copy.cache() if table_to_copy.count() < 500000: @@ -11,5 +10,5 @@ def transform(self, table_to_copy): if __name__ == "__main__": - args = {'job_param_file': 'conf/jobs_metadata.yml'} + args = {"job_param_file": "conf/jobs_metadata.yml"} Commandliner(Job, **args) diff --git a/jobs/generic/dummy_job.py b/jobs/generic/dummy_job.py index 2af5183f..3e593ff1 100644 --- a/jobs/generic/dummy_job.py +++ b/jobs/generic/dummy_job.py @@ -1,11 +1,12 @@ -from yaetos.etl_utils import ETL_Base, Commandliner +from yaetos.etl_utils import ETLBase, Commandliner from pyspark.sql.types import StructType -class Job(ETL_Base): + +class Job(ETLBase): def transform(self): return self.sc_sql.createDataFrame([], StructType([])) if __name__ == "__main__": - args = {'job_param_file': 'conf/jobs_metadata.yml'} + args = {"job_param_file": "conf/jobs_metadata.yml"} Commandliner(Job, **args) diff --git a/tests/jobs/examples/ex1_frameworked_job_test.py b/tests/jobs/examples/ex1_frameworked_job_test.py index 75f36ba6..6b3bdb3f 100644 --- a/tests/jobs/examples/ex1_frameworked_job_test.py +++ b/tests/jobs/examples/ex1_frameworked_job_test.py @@ -1,27 +1,40 @@ import pytest -from jobs.examples.ex1_frameworked_job import Job +from yaetos.tests.jobs.examples.ex1_frameworked_job import Job -class Test_Job(object): +class TestJob: def test_transform(self, sc, sc_sql, ss, get_pre_jargs): - some_events = ss.read.json(sc.parallelize([ - {'session_id': 1234, 'action': 'searchResultPage', 'n_results': 1}, - {'session_id': 1234, 'action': 'searchResultPage', 'n_results': 1}, - {'session_id': 1235, 'action': 'searchResultPage', 'n_results': 1}, - {'session_id': 1236, 'action': 'other', 'n_results': 1}, - ])) + some_events = ss.read.json( + sc.parallelize( + [ + {"session_id": 1234, "action": "searchResultPage", "n_results": 1}, + {"session_id": 1234, "action": "searchResultPage", "n_results": 1}, + {"session_id": 1235, "action": "searchResultPage", "n_results": 1}, + {"session_id": 1236, "action": "other", "n_results": 1}, + ] + ) + ) - other_events = ss.read.json(sc.parallelize([ - {'session_id': 1234, 'other': 1}, - {'session_id': 1235, 'other': 1}, - {'session_id': 1236, 'other': 1}, - ])) + other_events = ss.read.json( + sc.parallelize( + [ + {"session_id": 1234, "other": 1}, + {"session_id": 1235, "other": 1}, + {"session_id": 1236, "other": 1}, + ] + ) + ) expected = [ - {'session_id': 1234, 'count_events': 2}, - {'session_id': 1235, 'count_events': 1}, - ] + {"session_id": 1234, "count_events": 2}, + {"session_id": 1235, "count_events": 1}, + ] - loaded_inputs={'some_events': some_events, 'other_events':other_events} - actual = Job(pre_jargs=get_pre_jargs(loaded_inputs.keys())).etl_no_io(sc, sc_sql, loaded_inputs=loaded_inputs)[0].toPandas().to_dict(orient='records') + loaded_inputs = {"some_events": some_events, "other_events": other_events} + actual = ( + Job(pre_jargs=get_pre_jargs(loaded_inputs.keys())) + .etl_no_io(sc, sc_sql, loaded_inputs=loaded_inputs)[0] + .toPandas() + .to_dict(orient="records") + ) assert actual == expected diff --git a/tests/jobs/examples/ex1_full_sql_job_test.py 
b/tests/jobs/examples/ex1_full_sql_job_test.py index d15d3397..e5f46bc9 100644 --- a/tests/jobs/examples/ex1_full_sql_job_test.py +++ b/tests/jobs/examples/ex1_full_sql_job_test.py @@ -2,30 +2,43 @@ from yaetos.sql_job import Job -class Test_Job(object): +class TestJob: def test_transform(self, sc, sc_sql, ss, get_pre_jargs): - some_events = ss.read.json(sc.parallelize([ - {'session_id': 1234, 'action': 'searchResultPage', 'n_results': 1}, - {'session_id': 1234, 'action': 'searchResultPage', 'n_results': 1}, - {'session_id': 1235, 'action': 'searchResultPage', 'n_results': 1}, - {'session_id': 1236, 'action': 'other', 'n_results': 1}, - ])) + some_events = ss.read.json( + sc.parallelize( + [ + {"session_id": 1234, "action": "searchResultPage", "n_results": 1}, + {"session_id": 1234, "action": "searchResultPage", "n_results": 1}, + {"session_id": 1235, "action": "searchResultPage", "n_results": 1}, + {"session_id": 1236, "action": "other", "n_results": 1}, + ] + ) + ) - other_events = ss.read.json(sc.parallelize([ - {'session_id': 1234, 'other': 1}, - {'session_id': 1235, 'other': 1}, - {'session_id': 1236, 'other': 1}, - ])) + other_events = ss.read.json( + sc.parallelize( + [ + {"session_id": 1234, "other": 1}, + {"session_id": 1235, "other": 1}, + {"session_id": 1236, "other": 1}, + ] + ) + ) expected = [ - {'session_id': 1234, 'count_events': 2}, - {'session_id': 1235, 'count_events': 1}, - ] + {"session_id": 1234, "count_events": 2}, + {"session_id": 1235, "count_events": 1}, + ] - sql_file = 'jobs/examples/ex1_full_sql_job.sql' + sql_file = "jobs/examples/ex1_full_sql_job.sql" - loaded_inputs={'some_events': some_events, 'other_events':other_events} + loaded_inputs = {"some_events": some_events, "other_events": other_events} pre_jargs = get_pre_jargs(loaded_inputs.keys()) - pre_jargs['cmd_args']['sql_file'] = sql_file - actual = Job(pre_jargs=pre_jargs).etl_no_io(sc, sc_sql, loaded_inputs=loaded_inputs)[0].toPandas().to_dict(orient='records') + pre_jargs["cmd_args"]["sql_file"] = sql_file + actual = ( + Job(pre_jargs=pre_jargs) + .etl_no_io(sc, sc_sql, loaded_inputs=loaded_inputs)[0] + .toPandas() + .to_dict(orient="records") + ) assert actual == expected diff --git a/tests/jobs/examples/ex4_dependency1_job_test.py b/tests/jobs/examples/ex4_dependency1_job_test.py index 31b5a594..7e51ca19 100644 --- a/tests/jobs/examples/ex4_dependency1_job_test.py +++ b/tests/jobs/examples/ex4_dependency1_job_test.py @@ -2,22 +2,31 @@ from jobs.examples.ex4_dependency1_job import Job -class Test_Job(object): +class TestJob: def test_transform(self, sc, sc_sql, ss, get_pre_jargs): - some_events = ss.read.json(sc.parallelize([ - {'session_id': 1}, - {'session_id': 12}, - {'session_id': 123}, - {'session_id': 1234}, - ])) + some_events = ss.read.json( + sc.parallelize( + [ + {"session_id": 1}, + {"session_id": 12}, + {"session_id": 123}, + {"session_id": 1234}, + ] + ) + ) expected = [ - {'session_id': 1, 'session_length': 1}, - {'session_id': 12, 'session_length': 2}, - {'session_id': 123, 'session_length': 3}, - {'session_id': 1234, 'session_length': 4}, - ] + {"session_id": 1, "session_length": 1}, + {"session_id": 12, "session_length": 2}, + {"session_id": 123, "session_length": 3}, + {"session_id": 1234, "session_length": 4}, + ] - loaded_inputs={'some_events': some_events} - actual = Job(pre_jargs=get_pre_jargs(loaded_inputs.keys())).etl_no_io(sc, sc_sql, loaded_inputs=loaded_inputs)[0].toPandas().to_dict(orient='records') + loaded_inputs = {"some_events": some_events} + actual = ( + 
Job(pre_jargs=get_pre_jargs(loaded_inputs.keys())) + .etl_no_io(sc, sc_sql, loaded_inputs=loaded_inputs)[0] + .toPandas() + .to_dict(orient="records") + ) assert actual == expected diff --git a/tests/jobs/examples/ex4_dependency2_job_test.py b/tests/jobs/examples/ex4_dependency2_job_test.py index 47ceb170..8e89cf89 100644 --- a/tests/jobs/examples/ex4_dependency2_job_test.py +++ b/tests/jobs/examples/ex4_dependency2_job_test.py @@ -2,22 +2,31 @@ from jobs.examples.ex4_dependency2_job import Job -class Test_Job(object): +class TestJob: def test_transform(self, sc, sc_sql, ss, get_pre_jargs): - some_events = ss.read.json(sc.parallelize([ - {'session_id': 1, 'session_length': 1}, - {'session_id': 12, 'session_length': 2}, - {'session_id': 123, 'session_length': 3}, - {'session_id': 1234, 'session_length': 4}, - ])) + some_events = ss.read.json( + sc.parallelize( + [ + {"session_id": 1, "session_length": 1}, + {"session_id": 12, "session_length": 2}, + {"session_id": 123, "session_length": 3}, + {"session_id": 1234, "session_length": 4}, + ] + ) + ) expected = [ - {'session_id': 1, 'session_length': 1, 'doubled_length': 2}, - {'session_id': 12, 'session_length': 2, 'doubled_length': 4}, - {'session_id': 123, 'session_length': 3, 'doubled_length': 6}, - {'session_id': 1234, 'session_length': 4, 'doubled_length': 8}, - ] + {"session_id": 1, "session_length": 1, "doubled_length": 2}, + {"session_id": 12, "session_length": 2, "doubled_length": 4}, + {"session_id": 123, "session_length": 3, "doubled_length": 6}, + {"session_id": 1234, "session_length": 4, "doubled_length": 8}, + ] - loaded_inputs={'some_events': some_events} - actual = Job(pre_jargs=get_pre_jargs(loaded_inputs.keys())).etl_no_io(sc, sc_sql, loaded_inputs=loaded_inputs)[0].toPandas().to_dict(orient='records') + loaded_inputs = {"some_events": some_events} + actual = ( + Job(pre_jargs=get_pre_jargs(loaded_inputs.keys())) + .etl_no_io(sc, sc_sql, loaded_inputs=loaded_inputs)[0] + .toPandas() + .to_dict(orient="records") + ) assert actual == expected diff --git a/tests/jobs/examples/ex7_hybrid_pandas_spark_job_test.py b/tests/jobs/examples/ex7_hybrid_pandas_spark_job_test.py index f502d7b0..cf5356e2 100644 --- a/tests/jobs/examples/ex7_hybrid_pandas_spark_job_test.py +++ b/tests/jobs/examples/ex7_hybrid_pandas_spark_job_test.py @@ -2,27 +2,40 @@ from jobs.examples.ex7_hybrid_pandas_spark_job import Job -class Test_Job(object): +class TestJob: def test_transform(self, sc, sc_sql, ss, get_pre_jargs): - some_events = ss.read.json(sc.parallelize([ - {'session_id': 1234, 'action': 'searchResultPage', 'n_results': 1}, - {'session_id': 1234, 'action': 'searchResultPage', 'n_results': 1}, - {'session_id': 1235, 'action': 'searchResultPage', 'n_results': 1}, - {'session_id': 1236, 'action': 'other', 'n_results': 1}, - ])) + some_events = ss.read.json( + sc.parallelize( + [ + {"session_id": 1234, "action": "searchResultPage", "n_results": 1}, + {"session_id": 1234, "action": "searchResultPage", "n_results": 1}, + {"session_id": 1235, "action": "searchResultPage", "n_results": 1}, + {"session_id": 1236, "action": "other", "n_results": 1}, + ] + ) + ) - other_events = ss.read.json(sc.parallelize([ - {'session_id': 1234, 'other': 1}, - {'session_id': 1235, 'other': 1}, - {'session_id': 1236, 'other': 1}, - ])) + other_events = ss.read.json( + sc.parallelize( + [ + {"session_id": 1234, "other": 1}, + {"session_id": 1235, "other": 1}, + {"session_id": 1236, "other": 1}, + ] + ) + ) expected = [ - {'session_id': '1234', 'count_events': 2}, - 
{'session_id': '1235', 'count_events': 1}, + {"session_id": "1234", "count_events": 2}, + {"session_id": "1235", "count_events": 1}, # only diff with ex1_framework_job is session_id being str instead of int. - ] + ] - loaded_inputs={'some_events': some_events, 'other_events':other_events} - actual = Job(pre_jargs=get_pre_jargs(loaded_inputs.keys())).etl_no_io(sc, sc_sql, loaded_inputs=loaded_inputs)[0].toPandas().to_dict(orient='records') + loaded_inputs = {"some_events": some_events, "other_events": other_events} + actual = ( + Job(pre_jargs=get_pre_jargs(loaded_inputs.keys())) + .etl_no_io(sc, sc_sql, loaded_inputs=loaded_inputs)[0] + .toPandas() + .to_dict(orient="records") + ) assert actual == expected diff --git a/tests/jobs/examples/ex7_pandas_job_test.py b/tests/jobs/examples/ex7_pandas_job_test.py index f676d390..bba05fba 100644 --- a/tests/jobs/examples/ex7_pandas_job_test.py +++ b/tests/jobs/examples/ex7_pandas_job_test.py @@ -3,27 +3,35 @@ import pandas as pd -class Test_Job(object): +class TestJob: def test_transform(self, get_pre_jargs): - some_events = pd.DataFrame([ - {'session_id': 1234, 'action': 'searchResultPage', 'n_results': 1}, - {'session_id': 1234, 'action': 'searchResultPage', 'n_results': 1}, - {'session_id': 1235, 'action': 'searchResultPage', 'n_results': 1}, - {'session_id': 1236, 'action': 'other', 'n_results': 1}, - ]) + some_events = pd.DataFrame( + [ + {"session_id": 1234, "action": "searchResultPage", "n_results": 1}, + {"session_id": 1234, "action": "searchResultPage", "n_results": 1}, + {"session_id": 1235, "action": "searchResultPage", "n_results": 1}, + {"session_id": 1236, "action": "other", "n_results": 1}, + ] + ) - other_events = pd.DataFrame([ - {'session_id': 1234, 'other': 1}, - {'session_id': 1235, 'other': 1}, - {'session_id': 1236, 'other': 1}, - ]) + other_events = pd.DataFrame( + [ + {"session_id": 1234, "other": 1}, + {"session_id": 1235, "other": 1}, + {"session_id": 1236, "other": 1}, + ] + ) expected = [ - {'session_id': 1234, 'count_events': 2}, - {'session_id': 1235, 'count_events': 1}, + {"session_id": 1234, "count_events": 2}, + {"session_id": 1235, "count_events": 1}, # only diff with ex1_framework_job is session_id being str instead of int. 
- ] + ] - loaded_inputs={'some_events': some_events, 'other_events':other_events} - actual = Job(pre_jargs=get_pre_jargs(loaded_inputs.keys())).etl_no_io(sc=None, sc_sql=None, loaded_inputs=loaded_inputs)[0].to_dict(orient='records') + loaded_inputs = {"some_events": some_events, "other_events": other_events} + actual = ( + Job(pre_jargs=get_pre_jargs(loaded_inputs.keys())) + .etl_no_io(sc=None, sc_sql=None, loaded_inputs=loaded_inputs)[0] + .to_dict(orient="records") + ) assert actual == expected diff --git a/tests/yaetos/etl_utils_test.py b/tests/yaetos/etl_utils_test.py index 164f160c..e93ec3c6 100644 --- a/tests/yaetos/etl_utils_test.py +++ b/tests/yaetos/etl_utils_test.py @@ -2,158 +2,244 @@ from pandas.testing import assert_frame_equal import pandas as pd import numpy as np -from yaetos.etl_utils import ETL_Base, Commandliner, \ - Period_Builder, Job_Args_Parser, Job_Yml_Parser, Flow, \ - get_job_class, LOCAL_APP_FOLDER, JOBS_METADATA_FILE - - -class Test_ETL_Base(object): +from yaetos.etl_utils import ( + ETLBase, + Commandliner, + Period_Builder, + Job_Args_Parser, + Job_Yml_Parser, + Flow, + get_job_class, + LOCAL_APP_FOLDER, + JOBS_METADATA_FILE, +) + + +class TestETLBase: def test_check_pk(self, sc, sc_sql, ss): - sdf = ss.read.json(sc.parallelize([ - {'id': 1}, - {'id': 2}, - {'id': 3}])) + sdf = ss.read.json(sc.parallelize([{"id": 1}, {"id": 2}, {"id": 3}])) - pks = ['id'] - assert ETL_Base.check_pk(sdf, pks) is True + pks = ["id"] + assert ETLBase.check_pk(sdf, pks) is True - sdf = ss.read.json(sc.parallelize([ - {'id': 1}, - {'id': 2}, - {'id': 2}])) + sdf = ss.read.json(sc.parallelize([{"id": 1}, {"id": 2}, {"id": 2}])) - pks = ['id'] - assert ETL_Base.check_pk(sdf, pks) is False + pks = ["id"] + assert ETLBase.check_pk(sdf, pks) is False # def test_set_py_job(self, get_pre_jargs): # works locally but not from CI tool, where LOCAL_APP_FOLDER is different. - # py_job = ETL_Base(pre_jargs=get_pre_jargs({})).set_py_job() + # py_job = ETLBase(pre_jargs=get_pre_jargs({})).set_py_job() # assert py_job == LOCAL_APP_FOLDER+'yaetos/etl_utils.py' # file is the one that starts execution, typically the job python file. 
def test_load_inputs(self, sc, sc_sql, ss, get_pre_jargs): """Confirming load_inputs acts as a passthrough""" - sdf = ss.read.json(sc.parallelize([ - {'id': 1}, - {'id': 2}, - {'id': 3}])) - loaded_inputs = {'input1':sdf} + sdf = ss.read.json(sc.parallelize([{"id": 1}, {"id": 2}, {"id": 3}])) + loaded_inputs = {"input1": sdf} app_args_expected = loaded_inputs - assert ETL_Base(pre_jargs=get_pre_jargs(loaded_inputs.keys())).load_inputs(loaded_inputs) == app_args_expected + assert ( + ETLBase(pre_jargs=get_pre_jargs(loaded_inputs.keys())).load_inputs( + loaded_inputs + ) + == app_args_expected + ) def test_get_max_timestamp(self, sc, sc_sql, ss, get_pre_jargs): - sdf = ss.read.json(sc.parallelize([ - {'id': 1, 'timestamp': '2020-01-01'}, - {'id': 2, 'timestamp': '2020-01-02'}, - {'id': 3, 'timestamp': '2020-01-03'}])) + sdf = ss.read.json( + sc.parallelize( + [ + {"id": 1, "timestamp": "2020-01-01"}, + {"id": 2, "timestamp": "2020-01-02"}, + {"id": 3, "timestamp": "2020-01-03"}, + ] + ) + ) pre_jargs_over = { - 'defaults_args': { - 'inputs':{}, - 'output': {'inc_field': 'timestamp', 'type':None}}} - max_timestamp_expected = '2020-01-03' - assert ETL_Base(pre_jargs=get_pre_jargs(pre_jargs_over=pre_jargs_over)).get_max_timestamp(sdf) == max_timestamp_expected + "defaults_args": { + "inputs": {}, + "output": {"inc_field": "timestamp", "type": None}, + } + } + max_timestamp_expected = "2020-01-03" + assert ( + ETLBase( + pre_jargs=get_pre_jargs(pre_jargs_over=pre_jargs_over) + ).get_max_timestamp(sdf) + == max_timestamp_expected + ) -class Test_Period_Builder(object): +class TestPeriodBuilder: def test_get_last_day(self): from datetime import datetime - as_of_date = datetime.strptime("2021-01-02", '%Y-%m-%d') + + as_of_date = datetime.strptime("2021-01-02", "%Y-%m-%d") last_day_real = Period_Builder.get_last_day(as_of_date) last_day_expected = "2021-01-01" assert last_day_real == last_day_expected def test_get_first_to_last_day(self): from datetime import datetime + first_day = "2021-01-01" - as_of_date = datetime.strptime("2021-01-05", '%Y-%m-%d') + as_of_date = datetime.strptime("2021-01-05", "%Y-%m-%d") period_real = Period_Builder.get_first_to_last_day(first_day, as_of_date) period_expected = ["2021-01-01", "2021-01-02", "2021-01-03", "2021-01-04"] assert period_real == period_expected def test_get_last_output_to_last_day(self): from datetime import datetime + first_day_input = "2021-01-01" last_run_period = "2021-01-03" - as_of_date = datetime.strptime("2021-01-08", '%Y-%m-%d') - period_real = Period_Builder().get_last_output_to_last_day(last_run_period, first_day_input, as_of_date) + as_of_date = datetime.strptime("2021-01-08", "%Y-%m-%d") + period_real = Period_Builder().get_last_output_to_last_day( + last_run_period, first_day_input, as_of_date + ) period_expected = ["2021-01-04", "2021-01-05", "2021-01-06", "2021-01-07"] assert period_real == period_expected -class Test_Job_Yml_Parser(object): +class TestJobYmlParser: def test_set_py_job_from_name(self): - py_job = Job_Yml_Parser.set_py_job_from_name('some_job_name') - assert py_job == 'jobs/some_job_name' + py_job = Job_Yml_Parser.set_py_job_from_name("some_job_name") + assert py_job == "jobs/some_job_name" def test_set_job_name_from_file(self): - job_name = Job_Yml_Parser.set_job_name_from_file('jobs/some/file.py') - assert job_name == 'some/file.py' + job_name = Job_Yml_Parser.set_job_name_from_file("jobs/some/file.py") + assert job_name == "some/file.py" - job_name = 
Job_Yml_Parser.set_job_name_from_file(LOCAL_APP_FOLDER+'jobs/some/file.py') - assert job_name == 'some/file.py' + job_name = Job_Yml_Parser.set_job_name_from_file( + LOCAL_APP_FOLDER + "jobs/some/file.py" + ) + assert job_name == "some/file.py" # def test_set_sql_file_from_name(self) # to be added # Job_Yml_Parser.set_sql_file_from_name() -class Test_Job_Args_Parser(object): +class TestJob_Args_Parser: def test_no_param_override(self): - defaults_args = {'mode': 'dev_local', 'deploy':'code', 'output': {'path':'n/a', 'type': 'csv'}} - expected_args = {**{'inputs': {}, 'is_incremental': False}, **defaults_args} - - jargs = Job_Args_Parser(defaults_args=defaults_args, yml_args={}, job_args={}, cmd_args={}) + defaults_args = { + "mode": "dev_local", + "deploy": "code", + "output": {"path": "n/a", "type": "csv"}, + } + expected_args = {**{"inputs": {}, "is_incremental": False}, **defaults_args} + + jargs = Job_Args_Parser( + defaults_args=defaults_args, yml_args={}, job_args={}, cmd_args={} + ) assert jargs.merged_args == expected_args -class Test_Flow(object): +class TestFlow: def test_create_connections_jobs(self, sc, sc_sql): cmd_args = { - 'deploy': 'none', - 'mode': 'dev_local', - 'job_param_file': JOBS_METADATA_FILE, - 'job_name': 'examples/ex4_dependency2_job.py', - 'storage': 'local', - } - launch_jargs = Job_Args_Parser(defaults_args={}, yml_args=None, job_args={}, cmd_args=cmd_args, loaded_inputs={}) - connection_real = Flow.create_connections_jobs(launch_jargs.storage, launch_jargs.merged_args) + "deploy": "none", + "mode": "dev_local", + "job_param_file": JOBS_METADATA_FILE, + "job_name": "examples/ex4_dependency2_job.py", + "storage": "local", + } + launch_jargs = Job_Args_Parser( + defaults_args={}, + yml_args=None, + job_args={}, + cmd_args=cmd_args, + loaded_inputs={}, + ) + connection_real = Flow.create_connections_jobs( + launch_jargs.storage, launch_jargs.merged_args + ) connection_expected = pd.DataFrame( - columns=['source_job', 'destination_job'], - data=np.array([ - ['examples/ex0_extraction_job.py', 'examples/ex1_full_sql_job.sql'], - ['examples/ex0_extraction_job.py', 'examples/ex1_frameworked_job.py'], - ['examples/ex0_extraction_job.py', 'job_using_generic_template'], - ['examples/ex3_incremental_prep_job.py', 'examples/ex3_incremental_job.py'], - ['examples/ex4_dependency1_job.py', 'examples/ex4_dependency2_job.py'], - ['examples/ex4_dependency2_job.py', 'examples/ex4_dependency3_job.sql'], - ['examples/ex4_dependency1_job.py', 'examples/ex4_dependency3_job.sql'], - ['examples/ex4_dependency3_job.sql', 'examples/ex4_dependency4_job.py'], - ['examples/ex0_extraction_job.py', 'examples/ex7_extraction_small_job.py'], - ]), - ) + columns=["source_job", "destination_job"], + data=np.array( + [ + ["examples/ex0_extraction_job.py", "examples/ex1_full_sql_job.sql"], + [ + "examples/ex0_extraction_job.py", + "examples/ex1_frameworked_job.py", + ], + ["examples/ex0_extraction_job.py", "job_using_generic_template"], + [ + "examples/ex3_incremental_prep_job.py", + "examples/ex3_incremental_job.py", + ], + [ + "examples/ex4_dependency1_job.py", + "examples/ex4_dependency2_job.py", + ], + [ + "examples/ex4_dependency2_job.py", + "examples/ex4_dependency3_job.sql", + ], + [ + "examples/ex4_dependency1_job.py", + "examples/ex4_dependency3_job.sql", + ], + [ + "examples/ex4_dependency3_job.sql", + "examples/ex4_dependency4_job.py", + ], + [ + "examples/ex0_extraction_job.py", + "examples/ex7_extraction_small_job.py", + ], + ] + ), + ) assert_frame_equal(connection_real, 
connection_expected) def test_create_global_graph(self): import networkx as nx + df = pd.DataFrame( - columns=['source_job', 'destination_job'], - data=np.array([ - ['examples/ex3_incremental_prep_job.py', 'examples/ex3_incremental_job.py'], - ['examples/ex4_dependency1_job.py', 'examples/ex4_dependency2_job.py'], - ['examples/ex4_dependency2_job.py', 'examples/ex4_dependency3_job.sql'], - ['examples/ex4_dependency1_job.py', 'examples/ex4_dependency3_job.sql'], - ['examples/ex4_dependency3_job.sql', 'examples/ex4_dependency4_job.py']]), - ) + columns=["source_job", "destination_job"], + data=np.array( + [ + [ + "examples/ex3_incremental_prep_job.py", + "examples/ex3_incremental_job.py", + ], + [ + "examples/ex4_dependency1_job.py", + "examples/ex4_dependency2_job.py", + ], + [ + "examples/ex4_dependency2_job.py", + "examples/ex4_dependency3_job.sql", + ], + [ + "examples/ex4_dependency1_job.py", + "examples/ex4_dependency3_job.sql", + ], + [ + "examples/ex4_dependency3_job.sql", + "examples/ex4_dependency4_job.py", + ], + ] + ), + ) nx_real = Flow.create_global_graph(df) nx_expected = { - 'examples/ex3_incremental_prep_job.py': {'examples/ex3_incremental_job.py': {}}, - 'examples/ex3_incremental_job.py': {}, - 'examples/ex4_dependency1_job.py': {'examples/ex4_dependency2_job.py': {}, 'examples/ex4_dependency3_job.sql': {}}, - 'examples/ex4_dependency2_job.py': {'examples/ex4_dependency3_job.sql': {}}, - 'examples/ex4_dependency3_job.sql': {'examples/ex4_dependency4_job.py': {}}, - 'examples/ex4_dependency4_job.py': {} - } + "examples/ex3_incremental_prep_job.py": { + "examples/ex3_incremental_job.py": {} + }, + "examples/ex3_incremental_job.py": {}, + "examples/ex4_dependency1_job.py": { + "examples/ex4_dependency2_job.py": {}, + "examples/ex4_dependency3_job.sql": {}, + }, + "examples/ex4_dependency2_job.py": {"examples/ex4_dependency3_job.sql": {}}, + "examples/ex4_dependency3_job.sql": {"examples/ex4_dependency4_job.py": {}}, + "examples/ex4_dependency4_job.py": {}, + } # Other way to check graph equality: nx.is_isomorphic(nx_real, nx_expected) assert nx.to_dict_of_dicts(nx_real) == nx_expected def test_get_job_class(): - Job = get_job_class(py_job='jobs/examples/ex1_frameworked_job.py') # must be real job - assert issubclass(Job, ETL_Base) + Job = get_job_class( + py_job="jobs/examples/ex1_frameworked_job.py" + ) # must be real job + assert issubclass(Job, ETLBase) diff --git a/yaetos/daily_incremental_job.py b/yaetos/daily_incremental_job.py index b20711f4..78200fe0 100644 --- a/yaetos/daily_incremental_job.py +++ b/yaetos/daily_incremental_job.py @@ -1,41 +1,53 @@ """OBSOLETE. 
 Will be deleted."""
-from yaetos.etl_utils import ETL_Base
+from yaetos.etl_utils import ETLBase
 from datetime import datetime, date
 from dateutil.relativedelta import relativedelta
 
-class ETL_Daily_Incremental_Base(ETL_Base):
+class ETL_Daily_Incremental_Base(ETLBase):
     def __init__(self, pre_jargs={}, jargs=None, loaded_inputs={}):
-        super(ETL_Daily_Incremental_Base, self).__init__(pre_jargs, jargs, loaded_inputs)
+        super(ETL_Daily_Incremental_Base, self).__init__(
+            pre_jargs, jargs, loaded_inputs
+        )
         self.last_attempted_period = None
 
     def transform(self, **loaded_datasets):
         return self.get_transform_inc_from_last_days(**loaded_datasets)
 
     def transform_inc(self, period, **loaded_datasets):
-        """ The function that needs to be overriden by each specific job."""
+        """The function that needs to be overridden by each specific job."""
         raise NotImplementedError
 
     def get_transform_inc_from_last_days(self, **loaded_datasets):
-        """ Incremental assumes last available month from the previous output was fully loaded."""
-        first_day = self.jargs.merged_args['first_day']
+        """Incremental assumes last available month from the previous output was fully loaded."""
+        first_day = self.jargs.merged_args["first_day"]
         if not self.last_attempted_period:
             previous_output_max_timestamp = self.get_previous_output_max_timestamp()
-            self.last_attempted_period = previous_output_max_timestamp.strftime("%Y-%m-%d") if previous_output_max_timestamp else first_day  # TODO: if get_output_max_timestamp()=None, means new build, so should delete instance in DBs.
+            self.last_attempted_period = (
+                previous_output_max_timestamp.strftime("%Y-%m-%d")
+                if previous_output_max_timestamp
+                else first_day
+            )  # TODO: if get_output_max_timestamp()=None, means new build, so should delete instance in DBs.
 
-        periods = self.get_last_output_to_last_day(self.last_attempted_period, first_day)
+        periods = self.get_last_output_to_last_day(
+            self.last_attempted_period, first_day
+        )
         if len(periods) == 0:
-            self.logger.info('Output up to date. Nothing to run. last processed period={} and last period from now={}'.format(self.last_attempted_period, self.get_last_day()))
+            self.logger.info(
+                "Output up to date. Nothing to run.
last processed period={} and last period from now={}".format( + self.last_attempted_period, self.get_last_day() + ) + ) self.final_inc = True return None - self.logger.info('Periods remaining to load: {}'.format(periods)) + self.logger.info("Periods remaining to load: {}".format(periods)) period = periods[0] - self.logger.info('Period to be loaded in this run: {}'.format(period)) + self.logger.info("Period to be loaded in this run: {}".format(period)) self.final_inc = period == periods[-1] self.last_attempted_period = period - self.jargs.merged_args['file_tag'] = period + self.jargs.merged_args["file_tag"] = period df = self.transform_inc(period, **loaded_datasets) return df diff --git a/yaetos/deploy.py b/yaetos/deploy.py index 65318919..cba0be3c 100644 --- a/yaetos/deploy.py +++ b/yaetos/deploy.py @@ -24,145 +24,201 @@ from shutil import copyfile import site import yaetos.etl_utils as eu -from yaetos.git_utils import Git_Config_Manager +from yaetos.git_utils import GitConfigManager from yaetos.logger import setup_logging -logger = setup_logging('Deploy') +logger = setup_logging("Deploy") -class DeployPySparkScriptOnAws(object): + +class DeployPySparkScriptOnAws: """ Programmatically deploy a local PySpark script on an AWS cluster """ - SCRIPTS = 'yaetos/scripts/' # TODO: move to etl_utils.py - TMP = 'tmp/files_to_ship/' + + SCRIPTS = "yaetos/scripts/" # TODO: move to etl_utils.py + TMP = "tmp/files_to_ship/" def __init__(self, deploy_args, app_args): logger.info("etl deploy_args: \n{}".format(pformat(deploy_args))) logger.info("etl app_args: \n{}".format(pformat(app_args))) - aws_setup = deploy_args['aws_setup'] + aws_setup = deploy_args["aws_setup"] config = ConfigParser() - assert os.path.isfile(deploy_args['aws_config_file']) - config.read(deploy_args['aws_config_file']) + assert os.path.isfile(deploy_args["aws_config_file"]) + config.read(deploy_args["aws_config_file"]) self.app_args = app_args - self.app_file = app_args['py_job'] # TODO: remove all refs to app_file to be consistent. + self.app_file = app_args[ + "py_job" + ] # TODO: remove all refs to app_file to be consistent. self.aws_setup = aws_setup - self.ec2_key_name = config.get(aws_setup, 'ec2_key_name') - self.s3_region = config.get(aws_setup, 's3_region') - self.user = config.get(aws_setup, 'user') - self.profile_name = config.get(aws_setup, 'profile_name') - self.ec2_subnet_id = config.get(aws_setup, 'ec2_subnet_id') - self.extra_security_gp = config.get(aws_setup, 'extra_security_gp') - self.emr_core_instances = int(app_args.get('emr_core_instances', 1)) # TODO: make this update EMR_Scheduled mode too. + self.ec2_key_name = config.get(aws_setup, "ec2_key_name") + self.s3_region = config.get(aws_setup, "s3_region") + self.user = config.get(aws_setup, "user") + self.profile_name = config.get(aws_setup, "profile_name") + self.ec2_subnet_id = config.get(aws_setup, "ec2_subnet_id") + self.extra_security_gp = config.get(aws_setup, "extra_security_gp") + self.emr_core_instances = int( + app_args.get("emr_core_instances", 1) + ) # TODO: make this update EMR_Scheduled mode too. 
self.deploy_args = deploy_args - self.ec2_instance_master = app_args.get('ec2_instance_master', 'm5.xlarge') #'m5.12xlarge', # used m3.2xlarge (8 vCPU, 30 Gib RAM), and earlier m3.xlarge (4 vCPU, 15 Gib RAM) - self.ec2_instance_slaves = app_args.get('ec2_instance_slaves', 'm5.xlarge') + self.ec2_instance_master = app_args.get( + "ec2_instance_master", "m5.xlarge" + ) #'m5.12xlarge', # used m3.2xlarge (8 vCPU, 30 Gib RAM), and earlier m3.xlarge (4 vCPU, 15 Gib RAM) + self.ec2_instance_slaves = app_args.get("ec2_instance_slaves", "m5.xlarge") # Paths - self.s3_bucket_logs = config.get(aws_setup, 's3_bucket_logs') - self.metadata_folder = 'pipelines_metadata' - self.pipeline_name = self.generate_pipeline_name(self.deploy_args['mode'], self.app_args['job_name'], self.user) # format: some_job.some_user.20181204.153429 - self.job_log_path = self.get_job_log_path() # format: yaetos/logs/some_job.some_user.20181204.153429 - self.job_log_path_with_bucket = '{}/{}'.format(self.s3_bucket_logs, self.job_log_path) # format: bucket-tempo/yaetos/logs/some_job.some_user.20181204.153429 - self.package_path = self.job_log_path+'/code_package' # format: yaetos/logs/some_job.some_user.20181204.153429/package - self.package_path_with_bucket = self.job_log_path_with_bucket+'/code_package' # format: bucket-tempo/yaetos/logs/some_job.some_user.20181204.153429/package - self.session = boto3.Session(profile_name=self.profile_name) # aka AWS IAM profile - - spark_version = self.deploy_args.get('spark_version', '2.4') + self.s3_bucket_logs = config.get(aws_setup, "s3_bucket_logs") + self.metadata_folder = "pipelines_metadata" + self.pipeline_name = self.generate_pipeline_name( + self.deploy_args["mode"], self.app_args["job_name"], self.user + ) # format: some_job.some_user.20181204.153429 + self.job_log_path = ( + self.get_job_log_path() + ) # format: yaetos/logs/some_job.some_user.20181204.153429 + self.job_log_path_with_bucket = "{}/{}".format( + self.s3_bucket_logs, self.job_log_path + ) # format: bucket-tempo/yaetos/logs/some_job.some_user.20181204.153429 + self.package_path = ( + self.job_log_path + "/code_package" + ) # format: yaetos/logs/some_job.some_user.20181204.153429/package + self.package_path_with_bucket = ( + self.job_log_path_with_bucket + "/code_package" + ) # format: bucket-tempo/yaetos/logs/some_job.some_user.20181204.153429/package + self.session = boto3.Session( + profile_name=self.profile_name + ) # aka AWS IAM profile + + spark_version = self.deploy_args.get("spark_version", "2.4") # import ipdb; ipdb.set_trace() - if spark_version == '2.4': + if spark_version == "2.4": self.emr_version = "emr-5.26.0" # used "emr-5.26.0" successfully for a bit. emr-6.0.0 is latest as of june 2020, first with python3 by default but not supported by AWS Data Pipeline, emr-5.26.0 is latest as of aug 2019 # Was "emr-5.8.0", which was compatible with m3.2xlarge. # TODO: check switching to EMR 5.28 which has improvement to EMR runtime for spark. - elif spark_version == '3.0': + elif spark_version == "3.0": self.emr_version = "emr-6.1.0" # latest is "emr-6.3.0" but latest compatible with AWS Data Piupeline is "emr-6.1.0". 
             # see latest supported emr version by AWS Data Pipeline at https://docs.aws.amazon.com/datapipeline/latest/DeveloperGuide/dp-object-emrcluster.html
         try:
-            self.git_yml = Git_Config_Manager().get_config_from_git(eu.LOCAL_APP_FOLDER)
-            Git_Config_Manager().save_yaml(self.git_yml)
+            self.git_yml = GitConfigManager().get_config_from_git(eu.LOCAL_APP_FOLDER)
+            GitConfigManager().save_yaml(self.git_yml)
         except Exception as e:  # TODO: get specific exception
             self.git_yml = None
-            logger.info("Error saving yml file with git info, with error '{}'.".format(e))
-
+            logger.info(
+                "Error saving yml file with git info, with error '{}'.".format(e)
+            )
 
     def run(self):
         if self.continue_post_git_check() is False:
             return False
-        if self.deploy_args['deploy']=='EMR':
+        if self.deploy_args["deploy"] == "EMR":
             self.run_direct()
-        elif self.deploy_args['deploy'] in ('EMR_Scheduled', 'EMR_DataPipeTest'):
+        elif self.deploy_args["deploy"] in ("EMR_Scheduled", "EMR_DataPipeTest"):
             self.run_aws_data_pipeline()
-        elif self.deploy_args['deploy'] in ('code'):
+        elif self.deploy_args["deploy"] in ("code"):
             self.run_push_code()
         else:
             raise Exception("Shouldn't get here.")
 
     def continue_post_git_check(self):
-        if self.app_args['mode'] != 'prod_EMR':
+        if self.app_args["mode"] != "prod_EMR":
             print('Not pushing as "prod_EMR", so git check ignored')
             return True
         elif self.git_yml is None:
-            print('Code not git controled: git check ignored')
+            print("Code not git controlled: git check ignored")
             return True
-        git_yml = {key:value for key, value in self.git_yml.items() if key in ('is_dirty_yaetos', 'is_dirty_current', 'branch_current', 'branch_yaetos')}
-        if self.git_yml['is_dirty_current'] or self.git_yml['is_dirty_yaetos']:
-            print('Some changes to your git controled files are not committed to git: {}'.format(git_yml))
-            answer = input('Are you sure you want to deploy it ? [y/n] ')
-            if answer == 'y':
-                print('Ok, continuing deployment')
+        git_yml = {
+            key: value
+            for key, value in self.git_yml.items()
+            if key
+            in (
+                "is_dirty_yaetos",
+                "is_dirty_current",
+                "branch_current",
+                "branch_yaetos",
+            )
+        }
+        if self.git_yml["is_dirty_current"] or self.git_yml["is_dirty_yaetos"]:
+            print(
+                "Some changes to your git controlled files are not committed to git: {}".format(
+                    git_yml
+                )
+            )
+            answer = input("Are you sure you want to deploy it ? [y/n] ")
+            if answer == "y":
+                print("Ok, continuing deployment")
                 return True
-            elif answer == 'n':
-                print('Ok, cancelling deployment')
+            elif answer == "n":
+                print("Ok, cancelling deployment")
                 return False
             else:
-                print('Answer not understood, it should be "y" or "n", cancelling deployment')
+                print(
+                    'Answer not understood, it should be "y" or "n", cancelling deployment'
+                )
                 return False
         else:
-            print('Git controled files are clean, continuing with push to prod. Git setup: {}'.format(git_yml))
+            print(
+                "Git controlled files are clean, continuing with push to prod. Git setup: {}".format(
+                    git_yml
+                )
+            )
             return True
 
     def run_push_code(self):
         logger.info("Pushing code only")
         self.s3_ops(self.session)
-        if self.deploy_args.get('push_secrets', False):
-            self.push_secrets(creds_or_file=self.app_args['connection_file'])  # TODO: fix privileges to get creds in dev env
+        if self.deploy_args.get("push_secrets", False):
+            self.push_secrets(
+                creds_or_file=self.app_args["connection_file"]
+            )  # TODO: fix privileges to get creds in dev env
 
     def run_direct(self):
         """Useful to run job on cluster without bothering with aws data pipeline.
Also useful to add steps to existing cluster.""" self.s3_ops(self.session) - if self.deploy_args.get('push_secrets', False): - self.push_secrets(creds_or_file=self.app_args['connection_file']) # TODO: fix privileges to get creds in dev env + if self.deploy_args.get("push_secrets", False): + self.push_secrets( + creds_or_file=self.app_args["connection_file"] + ) # TODO: fix privileges to get creds in dev env # EMR ops - c = self.session.client('emr') + c = self.session.client("emr") clusters = self.get_active_clusters(c) cluster = self.choose_cluster(clusters) - new_cluster = cluster['id'] is None + new_cluster = cluster["id"] is None if new_cluster: print("Starting new cluster") self.start_spark_cluster(c, self.emr_version) - print("cluster name: %s, and id: %s"%(self.pipeline_name, self.cluster_id)) + print( + "cluster name: %s, and id: %s" % (self.pipeline_name, self.cluster_id) + ) self.step_run_setup_scripts(c) else: - print("Reusing existing cluster, name: %s, and id: %s"%(cluster['name'], cluster['id'])) - self.cluster_id = cluster['id'] + print( + "Reusing existing cluster, name: %s, and id: %s" + % (cluster["name"], cluster["id"]) + ) + self.cluster_id = cluster["id"] self.step_run_setup_scripts(c) # Run job self.step_spark_submit(c, self.app_file, self.app_args) # Clean - if new_cluster and not self.deploy_args.get('leave_on') and self.app_args.get('clean_post_run'): # TODO: add clean_post_run in input options. + if ( + new_cluster + and not self.deploy_args.get("leave_on") + and self.app_args.get("clean_post_run") + ): # TODO: add clean_post_run in input options. logger.info("New cluster setup to be deleted after job finishes.") self.describe_status_until_terminated(c) - self.remove_temp_files(s3) # TODO: remove tmp files for existing clusters too but only tmp files for the job + self.remove_temp_files( + s3 + ) # TODO: remove tmp files for existing clusters too but only tmp files for the job def s3_ops(self, session): - s3 = session.resource('s3') + s3 = session.resource("s3") self.temp_bucket_exists(s3) self.tar_python_scripts() self.move_bash_to_local_temp() @@ -170,51 +226,59 @@ def s3_ops(self, session): def get_active_clusters(self, c): response = c.list_clusters( - ClusterStates=['STARTING','BOOTSTRAPPING','RUNNING','WAITING'], - ) - clusters = [(ii+1, item['Id'],item['Name']) for ii, item in enumerate(response['Clusters'])] + ClusterStates=["STARTING", "BOOTSTRAPPING", "RUNNING", "WAITING"], + ) + clusters = [ + (ii + 1, item["Id"], item["Name"]) + for ii, item in enumerate(response["Clusters"]) + ] return clusters def choose_cluster(self, clusters, cluster_id=None): if len(clusters) == 0: - print('No cluster found, will create a new one') - return {'id': None, - 'name': None} + print("No cluster found, will create a new one") + return {"id": None, "name": None} if cluster_id is not None: - print('Cluster_id set by user to {}'.format(cluster_id)) - return {'id': cluster_id, - 'name': None} - - clusters.append((len(clusters)+1, None, 'Create a new cluster')) - print('Clusters found for AWS account "%s":'%(self.aws_setup)) - print('\n'.join(['[%s] %s'%(item[0], item[2]) for item in clusters])) - answer = input('Your choice ? 
') - return {'id':clusters[int(answer)-1][1], - 'name':clusters[int(answer)-1][2]} + print("Cluster_id set by user to {}".format(cluster_id)) + return {"id": cluster_id, "name": None} + + clusters.append((len(clusters) + 1, None, "Create a new cluster")) + print('Clusters found for AWS account "%s":' % (self.aws_setup)) + print("\n".join(["[%s] %s" % (item[0], item[2]) for item in clusters])) + answer = input("Your choice ? ") + return { + "id": clusters[int(answer) - 1][1], + "name": clusters[int(answer) - 1][2], + } @staticmethod def generate_pipeline_name(mode, job_name, user): - mode_label = {'dev_EMR':'dev', 'prod_EMR':'prod'}[mode] + mode_label = {"dev_EMR": "dev", "prod_EMR": "prod"}[mode] """Opposite of get_job_name()""" name = "yaetos__{mode_label}__{pname}__{time}".format( mode_label=mode_label, - pname=job_name.replace('.','_d_').replace('/','_s_'), + pname=job_name.replace(".", "_d_").replace("/", "_s_"), # user.replace('.','_'), - time=datetime.now().strftime("%Y%m%dT%H%M%S")) + time=datetime.now().strftime("%Y%m%dT%H%M%S"), + ) print('Pipeline Name "{}":'.format(name)) return name @staticmethod def get_job_name(pipeline_name): """Opposite of generate_pipeline_name()""" - return pipeline_name.split('__')[2].replace('_d_', '.').replace('_s_', '/') if '__' in pipeline_name else None + return ( + pipeline_name.split("__")[2].replace("_d_", ".").replace("_s_", "/") + if "__" in pipeline_name + else None + ) def get_job_log_path(self): - if self.deploy_args.get('mode')=='prod_EMR': - return '{}/jobs_code/production'.format(self.metadata_folder) + if self.deploy_args.get("mode") == "prod_EMR": + return "{}/jobs_code/production".format(self.metadata_folder) else: - return '{}/jobs_code/{}'.format(self.metadata_folder, self.pipeline_name) + return "{}/jobs_code/{}".format(self.metadata_folder, self.pipeline_name) def temp_bucket_exists(self, s3): """ @@ -227,57 +291,83 @@ def temp_bucket_exists(self, s3): except botocore.exceptions.ClientError as e: # If a client error is thrown, then check that it was a 404 error. # If it was a 404 error, then the bucket does not exist. - error_code = int(e.response['Error']['Code']) + error_code = int(e.response["Error"]["Code"]) if error_code == 404: - terminate("Bucket for temporary files does not exist: "+self.s3_bucket_logs+' '+e.message) - terminate("Error while connecting to temporary Bucket: "+self.s3_bucket_logs+' '+e.message) - logger.info("S3 bucket for temporary files exists: "+self.s3_bucket_logs) + terminate( + "Bucket for temporary files does not exist: " + + self.s3_bucket_logs + + " " + + e.message + ) + terminate( + "Error while connecting to temporary Bucket: " + + self.s3_bucket_logs + + " " + + e.message + ) + logger.info("S3 bucket for temporary files exists: " + self.s3_bucket_logs) def tar_python_scripts(self): base = self.get_package_path() output_path = self.TMP + "scripts.tar.gz" # Create tar.gz file - t_file = tarfile.open(output_path, 'w:gz') + t_file = tarfile.open(output_path, "w:gz") # Add config files - if self.app_args['job_param_file']: - t_file.add(self.app_args['job_param_file'], arcname=eu.JOBS_METADATA_FILE) + if self.app_args["job_param_file"]: + t_file.add(self.app_args["job_param_file"], arcname=eu.JOBS_METADATA_FILE) - git_yml = 'conf/git_config.yml' + git_yml = "conf/git_config.yml" if os.path.isfile(git_yml): t_file.add(git_yml, arcname=git_yml) # ./yaetos files # TODO: check a way to deploy the yaetos code locally for testing. 
- files = os.listdir(base+'yaetos/') + files = os.listdir(base + "yaetos/") for f in files: - t_file.add(base+'yaetos/' + f, arcname='yaetos/' + f, filter=lambda obj: obj if obj.name.endswith('.py') else None) + t_file.add( + base + "yaetos/" + f, + arcname="yaetos/" + f, + filter=lambda obj: obj if obj.name.endswith(".py") else None, + ) # ./libs files # TODO: get better way to walk down tree (reuse walk from below) - files = os.listdir(base+'yaetos/libs/') + files = os.listdir(base + "yaetos/libs/") for f in files: - t_file.add(base+'yaetos/libs/' + f, arcname='yaetos/libs/' + f, filter=lambda obj: obj if obj.name.endswith('.py') else None) + t_file.add( + base + "yaetos/libs/" + f, + arcname="yaetos/libs/" + f, + filter=lambda obj: obj if obj.name.endswith(".py") else None, + ) - files = os.listdir(base+'yaetos/libs/analysis_toolkit/') + files = os.listdir(base + "yaetos/libs/analysis_toolkit/") for f in files: - t_file.add(base+'yaetos/libs/analysis_toolkit/' + f, arcname='yaetos/libs/analysis_toolkit/' + f, filter=lambda obj: obj if obj.name.endswith('.py') else None) + t_file.add( + base + "yaetos/libs/analysis_toolkit/" + f, + arcname="yaetos/libs/analysis_toolkit/" + f, + filter=lambda obj: obj if obj.name.endswith(".py") else None, + ) - files = os.listdir(base+'yaetos/libs/python_db_connectors/') + files = os.listdir(base + "yaetos/libs/python_db_connectors/") for f in files: - t_file.add(base+'yaetos/libs/python_db_connectors/' + f, arcname='yaetos/libs/python_db_connectors/' + f, filter=lambda obj: obj if obj.name.endswith('.py') else None) + t_file.add( + base + "yaetos/libs/python_db_connectors/" + f, + arcname="yaetos/libs/python_db_connectors/" + f, + filter=lambda obj: obj if obj.name.endswith(".py") else None, + ) # ./jobs files and folders # TODO: extract code below in external function. files = [] - for (dirpath, dirnames, filenames) in os.walk(self.app_args['jobs_folder']): + for (dirpath, dirnames, filenames) in os.walk(self.app_args["jobs_folder"]): for file in filenames: if file.endswith(".py") or file.endswith(".sql"): path = os.path.join(dirpath, file) - dir_tar = dirpath[len(self.app_args['jobs_folder']):] + dir_tar = dirpath[len(self.app_args["jobs_folder"]) :] path_tar = os.path.join(eu.JOB_FOLDER, dir_tar, file) - files.append((path,path_tar)) + files.append((path, path_tar)) for f, f_arc in files: t_file.add(f, arcname=f_arc) @@ -289,41 +379,71 @@ def tar_python_scripts(self): def move_bash_to_local_temp(self): base = self.get_package_path() - for item in ['setup_master.sh', 'setup_master_alt.sh', 'setup_nodes.sh', 'setup_nodes_alt.sh', 'terminate_idle_cluster.sh']: - copyfile(base+self.SCRIPTS+item, self.TMP+item) + for item in [ + "setup_master.sh", + "setup_master_alt.sh", + "setup_nodes.sh", + "setup_nodes_alt.sh", + "terminate_idle_cluster.sh", + ]: + copyfile(base + self.SCRIPTS + item, self.TMP + item) logger.info("Added all EMR setup files to {}".format(self.TMP)) def get_package_path(self): """ Getting the package path depending on whether the core code is coding from lib (through pip install) or from local repo (for faster dev iterations). """ - if self.app_args['code_source'] == 'lib': + if self.app_args["code_source"] == "lib": bases = site.getsitepackages() - if len(bases)>1: - logger.info("There is more than one source of code to ship to EMR '{}'. 
Will continue with the first one.".format(bases)) - base = bases[0] + '/' - elif self.app_args['code_source'] == 'repo': + if len(bases) > 1: + logger.info( + "There is more than one source of code to ship to EMR '{}'. Will continue with the first one.".format( + bases + ) + ) + base = bases[0] + "/" + elif self.app_args["code_source"] == "repo": base = eu.LOCAL_APP_FOLDER - logger.info("Source of yaetos code to be shipped: {}".format(base+'yaetos/')) + logger.info("Source of yaetos code to be shipped: {}".format(base + "yaetos/")) return base def upload_temp_files(self, s3): """ Move the PySpark + bash scripts to the S3 bucket we use to store temporary files """ - setup_master = 'setup_master.sh' if self.deploy_args.get('spark_version', '2.4') == '2.4' else '/setup_master_alt.sh' - setup_nodes = 'setup_nodes.sh' if self.deploy_args.get('spark_version', '2.4') == '2.4' else '/setup_nodes_alt.sh' + setup_master = ( + "setup_master.sh" + if self.deploy_args.get("spark_version", "2.4") == "2.4" + else "/setup_master_alt.sh" + ) + setup_nodes = ( + "setup_nodes.sh" + if self.deploy_args.get("spark_version", "2.4") == "2.4" + else "/setup_nodes_alt.sh" + ) # Looping through all 4 steps below doesn't work (Fails silently) so done 1 by 1. - s3.Object(self.s3_bucket_logs, self.package_path + '/setup_master.sh')\ - .put(Body=open(self.TMP+setup_master, 'rb'), ContentType='text/x-sh') - s3.Object(self.s3_bucket_logs, self.package_path + '/setup_nodes.sh')\ - .put(Body=open(self.TMP+setup_nodes, 'rb'), ContentType='text/x-sh') - s3.Object(self.s3_bucket_logs, self.package_path + '/terminate_idle_cluster.sh')\ - .put(Body=open(self.TMP+'terminate_idle_cluster.sh', 'rb'), ContentType='text/x-sh') - s3.Object(self.s3_bucket_logs, self.package_path + '/scripts.tar.gz')\ - .put(Body=open(self.TMP+'scripts.tar.gz', 'rb'), ContentType='application/x-tar') - logger.info("Uploaded job files (scripts.tar.gz, {}, {}, terminate_idle_cluster.sh) to bucket path '{}/{}'".format(setup_master, setup_nodes, self.s3_bucket_logs, self.package_path)) + s3.Object(self.s3_bucket_logs, self.package_path + "/setup_master.sh").put( + Body=open(self.TMP + setup_master, "rb"), ContentType="text/x-sh" + ) + s3.Object(self.s3_bucket_logs, self.package_path + "/setup_nodes.sh").put( + Body=open(self.TMP + setup_nodes, "rb"), ContentType="text/x-sh" + ) + s3.Object( + self.s3_bucket_logs, self.package_path + "/terminate_idle_cluster.sh" + ).put( + Body=open(self.TMP + "terminate_idle_cluster.sh", "rb"), + ContentType="text/x-sh", + ) + s3.Object(self.s3_bucket_logs, self.package_path + "/scripts.tar.gz").put( + Body=open(self.TMP + "scripts.tar.gz", "rb"), + ContentType="application/x-tar", + ) + logger.info( + "Uploaded job files (scripts.tar.gz, {}, {}, terminate_idle_cluster.sh) to bucket path '{}/{}'".format( + setup_master, setup_nodes, self.s3_bucket_logs, self.package_path + ) + ) return True def remove_temp_files(self, s3): @@ -336,86 +456,106 @@ def remove_temp_files(self, s3): for key in bucket.objects.all(): if key.key.startswith(self.pipeline_name) is True: key.delete() - logger.info("Removed '{}' from bucket for temporary files".format(key.key)) + logger.info( + "Removed '{}' from bucket for temporary files".format(key.key) + ) def start_spark_cluster(self, c, emr_version): """ :param c: EMR client :return: """ - instance_groups = [{ - 'Name': 'EmrMaster', - 'InstanceRole': 'MASTER', - 'InstanceType': self.ec2_instance_master, - 'InstanceCount': 1, - }] + instance_groups = [ + { + "Name": "EmrMaster", + "InstanceRole": 
"MASTER", + "InstanceType": self.ec2_instance_master, + "InstanceCount": 1, + } + ] if self.emr_core_instances != 0: - instance_groups += [{ - 'Name': 'EmrCore', - 'InstanceRole': 'CORE', - 'InstanceType': self.ec2_instance_slaves, - 'InstanceCount': self.emr_core_instances, - }] + instance_groups += [ + { + "Name": "EmrCore", + "InstanceRole": "CORE", + "InstanceType": self.ec2_instance_slaves, + "InstanceCount": self.emr_core_instances, + } + ] response = c.run_job_flow( Name=self.pipeline_name, - LogUri="s3://{}/{}/manual_run_logs/".format(self.s3_bucket_logs, self.metadata_folder), + LogUri="s3://{}/{}/manual_run_logs/".format( + self.s3_bucket_logs, self.metadata_folder + ), ReleaseLabel=emr_version, Instances={ - 'InstanceGroups': instance_groups, - 'Ec2KeyName': self.ec2_key_name, - 'KeepJobFlowAliveWhenNoSteps': self.deploy_args.get('leave_on', False), - 'Ec2SubnetId': self.ec2_subnet_id, + "InstanceGroups": instance_groups, + "Ec2KeyName": self.ec2_key_name, + "KeepJobFlowAliveWhenNoSteps": self.deploy_args.get("leave_on", False), + "Ec2SubnetId": self.ec2_subnet_id, # 'AdditionalMasterSecurityGroups': self.extra_security_gp, # TODO : make optional in future. "[self.extra_security_gp] if self.extra_security_gp else []" doesn't work. }, - Applications=[{'Name': 'Hadoop'}, {'Name': 'Spark'}], + Applications=[{"Name": "Hadoop"}, {"Name": "Spark"}], Configurations=[ - { # Section to force python3 since emr-5.x uses python2 by default. - "Classification": "spark-env", - "Configurations": [{ - "Classification": "export", - "Properties": {"PYSPARK_PYTHON": "/usr/bin/python3"} - }] + { # Section to force python3 since emr-5.x uses python2 by default. + "Classification": "spark-env", + "Configurations": [ + { + "Classification": "export", + "Properties": {"PYSPARK_PYTHON": "/usr/bin/python3"}, + } + ], }, # { # Section to add jars (redshift...), not used for now, since passed in spark-submit args. 
# "Classification": "spark-defaults", # "Properties": { "spark.jars": ["/home/hadoop/redshift_tbd.jar"], "spark.driver.memory": "40G", "maximizeResourceAllocation": "true"}, # } ], - JobFlowRole='EMR_EC2_DefaultRole', - ServiceRole='EMR_DefaultRole', + JobFlowRole="EMR_EC2_DefaultRole", + ServiceRole="EMR_DefaultRole", VisibleToAllUsers=True, - BootstrapActions=[{ - 'Name': 'setup_nodes', - 'ScriptBootstrapAction': { - 'Path': 's3n://{}/setup_nodes.sh'.format(self.package_path_with_bucket), - 'Args': [] - } - }], - ) + BootstrapActions=[ + { + "Name": "setup_nodes", + "ScriptBootstrapAction": { + "Path": "s3n://{}/setup_nodes.sh".format( + self.package_path_with_bucket + ), + "Args": [], + }, + } + ], + ) # Process response to determine if Spark cluster was started, and if so, the JobFlowId of the cluster - response_code = response['ResponseMetadata']['HTTPStatusCode'] - if response['ResponseMetadata']['HTTPStatusCode'] == 200: - self.cluster_id = response['JobFlowId'] + response_code = response["ResponseMetadata"]["HTTPStatusCode"] + if response["ResponseMetadata"]["HTTPStatusCode"] == 200: + self.cluster_id = response["JobFlowId"] else: - terminate("Could not create EMR cluster (status code {})".format(response_code)) + terminate( + "Could not create EMR cluster (status code {})".format(response_code) + ) - logger.info("Created Spark EMR cluster ({}) with cluster_id {}".format(emr_version, self.cluster_id)) + logger.info( + "Created Spark EMR cluster ({}) with cluster_id {}".format( + emr_version, self.cluster_id + ) + ) def describe_status_until_terminated(self, c): """ :param c: :return: """ - print('Waiting for job to finish on cluster') + print("Waiting for job to finish on cluster") stop = False while stop is False: description = c.describe_cluster(ClusterId=self.cluster_id) - state = description['Cluster']['Status']['State'] - if state == 'TERMINATED' or state == 'TERMINATED_WITH_ERRORS': + state = description["Cluster"]["Status"]["State"] + if state == "TERMINATED" or state == "TERMINATED_WITH_ERRORS": stop = True - print('Job is finished') - logger.info('Cluster state:' + state) + print("Job is finished") + logger.info("Cluster state:" + state) time.sleep(30) # Prevent ThrottlingException by limiting number of requests def step_run_setup_scripts(self, c): @@ -425,18 +565,22 @@ def step_run_setup_scripts(self, c): """ response = c.add_job_flow_steps( JobFlowId=self.cluster_id, - Steps=[{ - 'Name': 'run setup', - 'ActionOnFailure': 'CONTINUE', - 'HadoopJarStep': { - 'Jar': 's3://elasticmapreduce/libs/script-runner/script-runner.jar', - 'Args': [ - "s3://{}/setup_master.sh".format(self.package_path_with_bucket), - "s3://{}".format(self.package_path_with_bucket), - ] - } - }] - ) + Steps=[ + { + "Name": "run setup", + "ActionOnFailure": "CONTINUE", + "HadoopJarStep": { + "Jar": "s3://elasticmapreduce/libs/script-runner/script-runner.jar", + "Args": [ + "s3://{}/setup_master.sh".format( + self.package_path_with_bucket + ), + "s3://{}".format(self.package_path_with_bucket), + ], + }, + } + ], + ) logger.info("Added step") time.sleep(1) # Prevent ThrottlingException @@ -449,24 +593,32 @@ def step_spark_submit(self, c, app_file, app_args): response = c.add_job_flow_steps( JobFlowId=self.cluster_id, - Steps=[{ - 'Name': 'Spark Application', - 'ActionOnFailure': 'CONTINUE', - 'HadoopJarStep': { - 'Jar': 'command-runner.jar', - 'Args': cmd_runner_args - } - }] - ) - logger.info("Added step 'spark-submit' with command line '{}'".format(cmd_runner_args)) + Steps=[ + { + "Name": "Spark 
Application", + "ActionOnFailure": "CONTINUE", + "HadoopJarStep": { + "Jar": "command-runner.jar", + "Args": cmd_runner_args, + }, + } + ], + ) + logger.info( + "Added step 'spark-submit' with command line '{}'".format(cmd_runner_args) + ) time.sleep(1) # Prevent ThrottlingException def get_spark_submit_args(self, app_file, app_args): - emr_mode = 'dev_EMR' if app_args['mode'] == 'dev_local' else app_args['mode'] - launcher_file = app_args.get('launcher_file') or app_file - package = eu.PACKAGES_EMR if self.deploy_args.get('spark_version', '2.4') == '2.4' else eu.PACKAGES_EMR_ALT - package_str = ','.join(package) + emr_mode = "dev_EMR" if app_args["mode"] == "dev_local" else app_args["mode"] + launcher_file = app_args.get("launcher_file") or app_file + package = ( + eu.PACKAGES_EMR + if self.deploy_args.get("spark_version", "2.4") == "2.4" + else eu.PACKAGES_EMR_ALT + ) + package_str = ",".join(package) spark_submit_args = [ "spark-submit", @@ -474,74 +626,130 @@ def get_spark_submit_args(self, app_file, app_args): "--py-files={}scripts.zip".format(eu.CLUSTER_APP_FOLDER), "--packages={}".format(package_str), "--jars={}".format(eu.JARS), - ] - med = ["--driver-memory={}".format(app_args['driver-memory'])] if app_args.get('driver-memory') else [] - cod = ["--driver-cores={}".format(app_args['driver-cores'])] if app_args.get('driver-cores') else [] - mee = ["--executor-memory={}".format(app_args['executor-memory'])] if app_args.get('executor-memory') else [] - coe = ["--executor-cores={}".format(app_args['executor-cores'])] if app_args.get('executor-cores') else [] + ] + med = ( + ["--driver-memory={}".format(app_args["driver-memory"])] + if app_args.get("driver-memory") + else [] + ) + cod = ( + ["--driver-cores={}".format(app_args["driver-cores"])] + if app_args.get("driver-cores") + else [] + ) + mee = ( + ["--executor-memory={}".format(app_args["executor-memory"])] + if app_args.get("executor-memory") + else [] + ) + coe = ( + ["--executor-cores={}".format(app_args["executor-cores"])] + if app_args.get("executor-cores") + else [] + ) spark_app_args = [ - eu.CLUSTER_APP_FOLDER+launcher_file, + eu.CLUSTER_APP_FOLDER + launcher_file, "--mode={}".format(emr_mode), "--deploy=none", "--storage=s3", - "--rerun_criteria={}".format(app_args.get('rerun_criteria')), + "--rerun_criteria={}".format(app_args.get("rerun_criteria")), + ] + jop = ( + [ + "--job_param_file={}".format( + eu.CLUSTER_APP_FOLDER + eu.JOBS_METADATA_FILE + ) ] - jop = ['--job_param_file={}'.format(eu.CLUSTER_APP_FOLDER+eu.JOBS_METADATA_FILE)] if app_args.get('job_param_file') else [] - dep = ["--dependencies"] if app_args.get('dependencies') else [] - box = ["--chain_dependencies"] if app_args.get('chain_dependencies') else [] - sql = [] # ["--sql_file={}".format(eu.CLUSTER_APP_FOLDER+app_args['sql_file'])] if app_args.get('sql_file') else [] # not needed when sql job is run from jobs/generic/launcher.py. TODO: make it work when launching from jobs/generic/sql_job.py - nam = ["--job_name={}".format(app_args['job_name'])] if app_args.get('job_name') else [] + if app_args.get("job_param_file") + else [] + ) + dep = ["--dependencies"] if app_args.get("dependencies") else [] + box = ["--chain_dependencies"] if app_args.get("chain_dependencies") else [] + sql = ( + [] + ) # ["--sql_file={}".format(eu.CLUSTER_APP_FOLDER+app_args['sql_file'])] if app_args.get('sql_file') else [] # not needed when sql job is run from jobs/generic/launcher.py. 
TODO: make it work when launching from jobs/generic/sql_job.py + nam = ( + ["--job_name={}".format(app_args["job_name"])] + if app_args.get("job_name") + else [] + ) - return spark_submit_args + med + cod + mee + coe + spark_app_args + jop + dep + box + sql + nam + return ( + spark_submit_args + + med + + cod + + mee + + coe + + spark_app_args + + jop + + dep + + box + + sql + + nam + ) def run_aws_data_pipeline(self): self.s3_ops(self.session) - if self.deploy_args.get('push_secrets', False): - self.push_secrets(creds_or_file=self.app_args['connection_file']) # TODO: fix privileges to get creds in dev env + if self.deploy_args.get("push_secrets", False): + self.push_secrets( + creds_or_file=self.app_args["connection_file"] + ) # TODO: fix privileges to get creds in dev env # AWSDataPipeline ops - client = self.session.client('datapipeline') + client = self.session.client("datapipeline") self.deactivate_similar_pipelines(client, self.pipeline_name) pipe_id = self.create_data_pipeline(client) - parameterValues = self.define_data_pipeline(client, pipe_id, self.emr_core_instances) + parameterValues = self.define_data_pipeline( + client, pipe_id, self.emr_core_instances + ) self.activate_data_pipeline(client, pipe_id, parameterValues) def create_data_pipeline(self, client): unique_id = uuid.uuid1() - create = client.create_pipeline(name=self.pipeline_name, uniqueId=str(unique_id)) - logger.debug('Pipeline created :' + str(create)) + create = client.create_pipeline( + name=self.pipeline_name, uniqueId=str(unique_id) + ) + logger.debug("Pipeline created :" + str(create)) - pipe_id = create['pipelineId'] # format: 'df-0624751J5O10SBRYJJF' - logger.info('Created pipeline with id ' + pipe_id) - logger.debug('Pipeline description :' + str(client.describe_pipelines(pipelineIds=[pipe_id]))) + pipe_id = create["pipelineId"] # format: 'df-0624751J5O10SBRYJJF' + logger.info("Created pipeline with id " + pipe_id) + logger.debug( + "Pipeline description :" + + str(client.describe_pipelines(pipelineIds=[pipe_id])) + ) return pipe_id def define_data_pipeline(self, client, pipe_id, emr_core_instances): import awscli.customizations.datapipeline.translator as trans + base = self.get_package_path() if emr_core_instances != 0: - definition_file = base+'yaetos/definition.json' # see syntax in datapipeline-dg.pdf p285 # to add in there: /*"AdditionalMasterSecurityGroups": "#{}", /* To add later to match EMR mode */ + definition_file = ( + base + "yaetos/definition.json" + ) # see syntax in datapipeline-dg.pdf p285 # to add in there: /*"AdditionalMasterSecurityGroups": "#{}", /* To add later to match EMR mode */ else: - definition_file = base+'yaetos/definition_standalone_cluster.json' + definition_file = base + "yaetos/definition_standalone_cluster.json" # TODO: have 1 json for both to avoid having to track duplication. - definition = json.load(open(definition_file, 'r')) # Note: Data Pipeline doesn't support emr-6.0.0 yet. + definition = json.load( + open(definition_file, "r") + ) # Note: Data Pipeline doesn't support emr-6.0.0 yet. 
pipelineObjects = trans.definition_to_api_objects(definition) parameterObjects = trans.definition_to_api_parameters(definition) parameterValues = trans.definition_to_parameter_values(definition) parameterValues = self.update_params(parameterValues) - logger.info('Filled pipeline with data from '+definition_file) + logger.info("Filled pipeline with data from " + definition_file) response = client.put_pipeline_definition( pipelineId=pipe_id, pipelineObjects=pipelineObjects, parameterObjects=parameterObjects, - parameterValues=parameterValues + parameterValues=parameterValues, ) - logger.info('put_pipeline_definition response: '+str(response)) + logger.info("put_pipeline_definition response: " + str(response)) return parameterValues def activate_data_pipeline(self, client, pipe_id, parameterValues): @@ -550,81 +758,142 @@ def activate_data_pipeline(self, client, pipe_id, parameterValues): parameterValues=parameterValues, # optional. If set, need to specify all params as per json. # startTimestamp=datetime(2018, 12, 1) # optional ) - logger.info('activate_pipeline response: '+str(response)) - logger.info('Activated pipeline ' + pipe_id) + logger.info("activate_pipeline response: " + str(response)) + logger.info("Activated pipeline " + pipe_id) def list_data_pipeline(self, client): - out = client.list_pipelines(marker='') - pipelines = out['pipelineIdList'] - while out['hasMoreResults'] == True: - out = client.list_pipelines(marker=out['marker']) - pipelines += out['pipelineIdList'] + out = client.list_pipelines(marker="") + pipelines = out["pipelineIdList"] + while out["hasMoreResults"] == True: + out = client.list_pipelines(marker=out["marker"]) + pipelines += out["pipelineIdList"] return pipelines def deactivate_similar_pipelines(self, client, pipeline_id): pipelines = self.list_data_pipeline(client) for item in pipelines: - job_name = self.get_job_name(item['name']) - if job_name == self.app_args['job_name']: - response = client.deactivate_pipeline(pipelineId=item['id'], cancelActive=True) - logger.info('Deactivated pipeline {}, {}, {}'.format(job_name, item['name'], item['id'])) + job_name = self.get_job_name(item["name"]) + if job_name == self.app_args["job_name"]: + response = client.deactivate_pipeline( + pipelineId=item["id"], cancelActive=True + ) + logger.info( + "Deactivated pipeline {}, {}, {}".format( + job_name, item["name"], item["id"] + ) + ) def update_params(self, parameterValues): # TODO: check if easier/simpler to change values at the source json instead of a processed one. 
# Change key pair - myScheduleType = {'EMR_Scheduled': 'cron', 'EMR_DataPipeTest': 'ONDEMAND'}[self.deploy_args.get('deploy')] - myPeriod = self.deploy_args['frequency'] or '1 Day' - if self.deploy_args['start_date'] and isinstance(self.deploy_args['start_date'], datetime): - myStartDateTime = self.deploy_args['start_date'].strftime('%Y-%m-%dT%H:%M:%S') - elif self.deploy_args['start_date'] and isinstance(self.deploy_args['start_date'], str): - myStartDateTime = self.deploy_args['start_date'].format(today=datetime.today().strftime('%Y-%m-%d')) - else : - myStartDateTime = datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%S') - bootstrap = 's3://{}/setup_nodes.sh'.format(self.package_path_with_bucket) + myScheduleType = {"EMR_Scheduled": "cron", "EMR_DataPipeTest": "ONDEMAND"}[ + self.deploy_args.get("deploy") + ] + myPeriod = self.deploy_args["frequency"] or "1 Day" + if self.deploy_args["start_date"] and isinstance( + self.deploy_args["start_date"], datetime + ): + myStartDateTime = self.deploy_args["start_date"].strftime( + "%Y-%m-%dT%H:%M:%S" + ) + elif self.deploy_args["start_date"] and isinstance( + self.deploy_args["start_date"], str + ): + myStartDateTime = self.deploy_args["start_date"].format( + today=datetime.today().strftime("%Y-%m-%d") + ) + else: + myStartDateTime = datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S") + bootstrap = "s3://{}/setup_nodes.sh".format(self.package_path_with_bucket) for ii, item in enumerate(parameterValues): - if 'myEC2KeyPair' in item.values(): - parameterValues[ii] = {'id': u'myEC2KeyPair', 'stringValue': self.ec2_key_name} - elif 'mySubnet' in item.values(): - parameterValues[ii] = {'id': u'mySubnet', 'stringValue': self.ec2_subnet_id} - elif 'myPipelineLogUri' in item.values(): - parameterValues[ii] = {'id': u'myPipelineLogUri', 'stringValue': "s3://{}/{}/scheduled_run_logs/".format(self.s3_bucket_logs, self.metadata_folder)} - elif 'myScheduleType' in item.values(): - parameterValues[ii] = {'id': u'myScheduleType', 'stringValue': myScheduleType} - elif 'myPeriod' in item.values(): - parameterValues[ii] = {'id': u'myPeriod', 'stringValue': myPeriod} - elif 'myStartDateTime' in item.values(): - parameterValues[ii] = {'id': u'myStartDateTime', 'stringValue': myStartDateTime} - elif 'myBootstrapAction' in item.values(): - parameterValues[ii] = {'id': u'myBootstrapAction', 'stringValue': bootstrap} - elif 'myTerminateAfter' in item.values(): - parameterValues[ii] = {'id': u'myTerminateAfter', 'stringValue': self.deploy_args.get('terminate_after', '180 Minutes')} - elif 'myEMRReleaseLabel' in item.values(): - parameterValues[ii] = {'id': u'myEMRReleaseLabel', 'stringValue': self.emr_version} - elif 'myMasterInstanceType' in item.values(): - parameterValues[ii] = {'id': u'myMasterInstanceType', 'stringValue': self.ec2_instance_master} - elif 'myCoreInstanceCount' in item.values(): - parameterValues[ii] = {'id': u'myCoreInstanceCount', 'stringValue': str(self.emr_core_instances)} - elif 'myCoreInstanceType' in item.values(): - parameterValues[ii] = {'id': u'myCoreInstanceType', 'stringValue': self.ec2_instance_slaves} - + if "myEC2KeyPair" in item.values(): + parameterValues[ii] = { + "id": u"myEC2KeyPair", + "stringValue": self.ec2_key_name, + } + elif "mySubnet" in item.values(): + parameterValues[ii] = { + "id": u"mySubnet", + "stringValue": self.ec2_subnet_id, + } + elif "myPipelineLogUri" in item.values(): + parameterValues[ii] = { + "id": u"myPipelineLogUri", + "stringValue": "s3://{}/{}/scheduled_run_logs/".format( + self.s3_bucket_logs, 
self.metadata_folder + ), + } + elif "myScheduleType" in item.values(): + parameterValues[ii] = { + "id": u"myScheduleType", + "stringValue": myScheduleType, + } + elif "myPeriod" in item.values(): + parameterValues[ii] = {"id": u"myPeriod", "stringValue": myPeriod} + elif "myStartDateTime" in item.values(): + parameterValues[ii] = { + "id": u"myStartDateTime", + "stringValue": myStartDateTime, + } + elif "myBootstrapAction" in item.values(): + parameterValues[ii] = { + "id": u"myBootstrapAction", + "stringValue": bootstrap, + } + elif "myTerminateAfter" in item.values(): + parameterValues[ii] = { + "id": u"myTerminateAfter", + "stringValue": self.deploy_args.get( + "terminate_after", "180 Minutes" + ), + } + elif "myEMRReleaseLabel" in item.values(): + parameterValues[ii] = { + "id": u"myEMRReleaseLabel", + "stringValue": self.emr_version, + } + elif "myMasterInstanceType" in item.values(): + parameterValues[ii] = { + "id": u"myMasterInstanceType", + "stringValue": self.ec2_instance_master, + } + elif "myCoreInstanceCount" in item.values(): + parameterValues[ii] = { + "id": u"myCoreInstanceCount", + "stringValue": str(self.emr_core_instances), + } + elif "myCoreInstanceType" in item.values(): + parameterValues[ii] = { + "id": u"myCoreInstanceType", + "stringValue": self.ec2_instance_slaves, + } # Change steps to include proper path - setup_command = 's3://elasticmapreduce/libs/script-runner/script-runner.jar,s3://{s3_tmp_path}/setup_master.sh,s3://{s3_tmp_path}'.format(s3_tmp_path=self.package_path_with_bucket) # s3://elasticmapreduce/libs/script-runner/script-runner.jar,s3://bucket-tempo/ex1_frameworked_job.arthur_user1.20181129.231423/setup_master.sh,s3://bucket-tempo/ex1_frameworked_job.arthur_user1.20181129.231423/ - spark_submit_command = 'command-runner.jar,' + ','.join([item.replace(',', '\\\,') for item in self.get_spark_submit_args(self.app_file, self.app_args)]) # command-runner.jar,spark-submit,--py-files,/home/hadoop/app/scripts.zip,--packages=com.amazonaws:aws-java-sdk-pom:1.11.760\\\\,org.apache.hadoop:hadoop-aws:2.7.0,/home/hadoop/app/jobs/examples/ex1_frameworked_job.py,--storage=s3 # instructions about \\\ part: https://docs.aws.amazon.com/datapipeline/latest/DeveloperGuide/dp-object-emractivity.html + setup_command = "s3://elasticmapreduce/libs/script-runner/script-runner.jar,s3://{s3_tmp_path}/setup_master.sh,s3://{s3_tmp_path}".format( + s3_tmp_path=self.package_path_with_bucket + ) # s3://elasticmapreduce/libs/script-runner/script-runner.jar,s3://bucket-tempo/ex1_frameworked_job.arthur_user1.20181129.231423/setup_master.sh,s3://bucket-tempo/ex1_frameworked_job.arthur_user1.20181129.231423/ + spark_submit_command = "command-runner.jar," + ",".join( + [ + item.replace(",", "\\\,") + for item in self.get_spark_submit_args(self.app_file, self.app_args) + ] + ) # command-runner.jar,spark-submit,--py-files,/home/hadoop/app/scripts.zip,--packages=com.amazonaws:aws-java-sdk-pom:1.11.760\\\\,org.apache.hadoop:hadoop-aws:2.7.0,/home/hadoop/app/jobs/examples/ex1_frameworked_job.py,--storage=s3 # instructions about \\\ part: https://docs.aws.amazon.com/datapipeline/latest/DeveloperGuide/dp-object-emractivity.html - commands = [setup_command, spark_submit_command] + commands = [setup_command, spark_submit_command] mm = 0 for ii, item in enumerate(parameterValues): - if 'myEmrStep' in item.values() and mm < 2: # TODO: make more generic and cleaner - parameterValues[ii] = {'id': u'myEmrStep', 'stringValue': commands[mm]} + if ( + "myEmrStep" in item.values() and mm < 2 + ): # TODO: 
make more generic and cleaner + parameterValues[ii] = {"id": u"myEmrStep", "stringValue": commands[mm]} mm += 1 - logger.info('parameterValues after changes: '+str(parameterValues)) + logger.info("parameterValues after changes: " + str(parameterValues)) return parameterValues def push_secrets(self, creds_or_file): - client = self.session.client('secretsmanager') + client = self.session.client("secretsmanager") file = open(creds_or_file, "r") content = file.read() @@ -635,28 +904,36 @@ def push_secrets(self, creds_or_file): Name=eu.AWS_SECRET_ID, SecretString=content, ) - logger.debug('create_secret response: '+str(response)) - logger.info('Created aws secret, from {}, under secret_id:{}'.format(creds_or_file, eu.AWS_SECRET_ID)) + logger.debug("create_secret response: " + str(response)) + logger.info( + "Created aws secret, from {}, under secret_id:{}".format( + creds_or_file, eu.AWS_SECRET_ID + ) + ) except client.exceptions.ResourceExistsException: response = client.put_secret_value( SecretId=eu.AWS_SECRET_ID, SecretString=content, ) - logger.debug('put_secret_value response: '+str(response)) - logger.info('Updated aws secret, from {}, under secret_id:{}'.format(creds_or_file, eu.AWS_SECRET_ID)) + logger.debug("put_secret_value response: " + str(response)) + logger.info( + "Updated aws secret, from {}, under secret_id:{}".format( + creds_or_file, eu.AWS_SECRET_ID + ) + ) def delete_secrets(self): - """ To be used manually for now to free AWS resources. """ - client = self.session.client('secretsmanager') + """To be used manually for now to free AWS resources.""" + client = self.session.client("secretsmanager") response = client.delete_secret( SecretId=eu.AWS_SECRET_ID, # RecoveryWindowInDays=123, - ForceDeleteWithoutRecovery=True + ForceDeleteWithoutRecovery=True, ) - logger.debug('delete_secret response: '+str(response)) - logger.info('Deleted aws secret, secret_id:'+eu.AWS_SECRET_ID) - print('delete_secret response: {}'.format(response)) + logger.debug("delete_secret response: " + str(response)) + logger.info("Deleted aws secret, secret_id:" + eu.AWS_SECRET_ID) + print("delete_secret response: {}".format(response)) def deploy_all_scheduled(): @@ -664,41 +941,48 @@ def deploy_all_scheduled(): ### pb I don't get when deploying normally, from job files. ### TODO: also need to remove "dependency" run for the ones with no dependencies. def get_yml(args): - meta_file = args.get('job_param_file', 'repo') - if meta_file is 'repo': - meta_file = eu.CLUSTER_APP_FOLDER+eu.JOBS_METADATA_FILE if args['storage']=='s3' else eu.JOBS_METADATA_LOCAL_FILE + meta_file = args.get("job_param_file", "repo") + if meta_file is "repo": + meta_file = ( + eu.CLUSTER_APP_FOLDER + eu.JOBS_METADATA_FILE + if args["storage"] == "s3" + else eu.JOBS_METADATA_LOCAL_FILE + ) yml = eu.Job_Args_Parser.load_meta(meta_file) - logger.info('Loaded job param file: ' + meta_file) + logger.info("Loaded job param file: " + meta_file) return yml def get_bool(prompt): while True: try: - return {"":True, "y":True,"n":False}[input(prompt).lower()] + return {"": True, "y": True, "n": False}[input(prompt).lower()] except KeyError: - print("Invalid input please enter y or n!") + print("Invalid input please enter y or n!") def validate_job(job): return get_bool('Want to schedule "{}" [Y/n]? '.format(job)) # TODO: reuse etl_utils.py Commandliner/set_commandline_args() to have cleaner interface and proper default values. 
- deploy_args = {'leave_on': False, - 'aws_config_file':eu.AWS_CONFIG_FILE, # TODO: make set-able - 'aws_setup':'dev'} - app_args = {'deploy':'EMR_Scheduled', - 'job_param_file': 'conf/jobs_metadata.yml', # TODO: make set-able. Set to external repo for testing. - 'chain_dependencies': False, - 'dependencies': True, - 'storage': 'local', - 'jobs_folder': eu.JOB_FOLDER, # TODO: make set-able - 'connection_file': eu.CONNECTION_FILE, # TODO: make set-able - } + deploy_args = { + "leave_on": False, + "aws_config_file": eu.AWS_CONFIG_FILE, # TODO: make set-able + "aws_setup": "dev", + } + app_args = { + "deploy": "EMR_Scheduled", + "job_param_file": "conf/jobs_metadata.yml", # TODO: make set-able. Set to external repo for testing. + "chain_dependencies": False, + "dependencies": True, + "storage": "local", + "jobs_folder": eu.JOB_FOLDER, # TODO: make set-able + "connection_file": eu.CONNECTION_FILE, # TODO: make set-able + } yml = get_yml(app_args) pipelines = yml.keys() for pipeline in pipelines: jargs = eu.Job_Args_Parser(app_args) - jargs.set_job_params(job_name=pipeline) # broken TODO: fix. + jargs.set_job_params(job_name=pipeline) # broken TODO: fix. if not jargs.frequency: continue @@ -716,49 +1000,60 @@ def terminate(error_message=None): """ if error_message: logger.error(error_message) - logger.critical('The script is now terminating') + logger.critical("The script is now terminating") exit() + def deploy_standalone(job_args_update={}): # TODO: refactor below to use 'deploy' arg to trigger all deploy features, instead of new 'deploy_option' set below. job_args = { # --- regular job params --- - 'job_param_file': None, - 'mode':'dev_EMR', - 'output': {'path':'n_a', 'type':'csv'}, - 'job_name': 'n_a', + "job_param_file": None, + "mode": "dev_EMR", + "output": {"path": "n_a", "type": "csv"}, + "job_name": "n_a", # --- params specific to running this file directly, can be overriden by command line --- - 'deploy_option':'deploy_code_only', + "deploy_option": "deploy_code_only", } job_args.update(job_args_update) parser, defaults_args = eu.Commandliner.define_commandline_args() cmd_args = eu.Commandliner.set_commandline_args(parser) - jargs = eu.Job_Args_Parser(defaults_args=defaults_args, yml_args=None, job_args=job_args, cmd_args=cmd_args, loaded_inputs={}) + jargs = eu.Job_Args_Parser( + defaults_args=defaults_args, + yml_args=None, + job_args=job_args, + cmd_args=cmd_args, + loaded_inputs={}, + ) deploy_args = jargs.get_deploy_args() - app_args=jargs.get_app_args() + app_args = jargs.get_app_args() - if jargs.deploy_option == 'deploy_job': # can be used to push random code to cluster + if ( + jargs.deploy_option == "deploy_job" + ): # can be used to push random code to cluster # TODO: fails to create a new cluster but works to add a step to an existing cluster. 
DeployPySparkScriptOnAws(deploy_args, app_args).run() - elif jargs.deploy_option == 'deploy_code_only': - deploy_args['deploy'] = 'code' + elif jargs.deploy_option == "deploy_code_only": + deploy_args["deploy"] = "code" DeployPySparkScriptOnAws(deploy_args, app_args).run() - elif jargs.deploy_option == 'show_list_pipelines': + elif jargs.deploy_option == "show_list_pipelines": deployer = DeployPySparkScriptOnAws(deploy_args, app_args) - client = deployer.session.client('datapipeline') + client = deployer.session.client("datapipeline") pipelines = deployer.list_data_pipeline(client) - print('#--- pipelines: ', pipelines) + print("#--- pipelines: ", pipelines) - elif jargs.deploy_option == 'deploy_all_jobs': - deploy_all_scheduled() # TODO: needs more testing. + elif jargs.deploy_option == "deploy_all_jobs": + deploy_all_scheduled() # TODO: needs more testing. - elif jargs.deploy_option == 'package_code_locally_only': # only for debuging - deployer = DeployPySparkScriptOnAws(deploy_args, app_args) # TODO: should remove need for some of these inputs as they are not required by tar_python_scripts() + elif jargs.deploy_option == "package_code_locally_only": # only for debuging + deployer = DeployPySparkScriptOnAws( + deploy_args, app_args + ) # TODO: should remove need for some of these inputs as they are not required by tar_python_scripts() pipelines = deployer.tar_python_scripts() - print('#--- Finished packaging ---') + print("#--- Finished packaging ---") return True diff --git a/yaetos/etl_utils.py b/yaetos/etl_utils.py index d0350226..cc112a51 100644 --- a/yaetos/etl_utils.py +++ b/yaetos/etl_utils.py @@ -28,8 +28,8 @@ import smtplib, ssl from dateutil.relativedelta import relativedelta import yaetos.spark_utils as su -from yaetos.git_utils import Git_Config_Manager -from yaetos.env_dispatchers import FS_Ops_Dispatcher, Cred_Ops_Dispatcher +from yaetos.git_utils import GitConfigManager +from yaetos.env_dispatchers import FSOpsDispatcher, Cred_Ops_Dispatcher from yaetos.logger import setup_logging logger = setup_logging("Job") @@ -70,7 +70,7 @@ JARS = "https://s3.amazonaws.com/redshift-downloads/drivers/jdbc/1.2.41.1065/RedshiftJDBC42-no-awssdk-1.2.41.1065.jar" # not available in public repo so cannot be put in "packages" var. 
-class ETL_Base(object): +class ETLBase: TABULAR_TYPES = ("csv", "parquet", "df", "mysql", "clickhouse") SPARK_DF_TYPES = ("csv", "parquet", "df", "mysql", "clickhouse") PANDAS_DF_TYPES = ("csv", "parquet", "df") @@ -88,7 +88,7 @@ def __init__(self, pre_jargs={}, jargs=None, loaded_inputs={}): self.loaded_inputs = loaded_inputs self.jargs = self.set_jargs(pre_jargs, loaded_inputs) if not jargs else jargs if self.jargs.manage_git_info: - git_yml = Git_Config_Manager().get_config( + git_yml = GitConfigManager().get_config( mode=self.jargs.mode, local_app_folder=LOCAL_APP_FOLDER, cluster_app_folder=CLUSTER_APP_FOLDER, @@ -482,14 +482,14 @@ def load_input(self, input_name): # Tabular, Pandas if self.jargs.inputs[input_name].get("df_type") == "pandas": if input_type == "csv": - pdf = FS_Ops_Dispatcher().load_pandas( + pdf = FSOpsDispatcher().load_pandas( path, file_type="csv", read_func="read_csv", read_kwargs=self.jargs.inputs[input_name].get("read_kwargs", {}), ) elif input_type == "parquet": - pdf = FS_Ops_Dispatcher().load_pandas( + pdf = FSOpsDispatcher().load_pandas( path, file_type="parquet", read_func="read_parquet", @@ -779,14 +779,14 @@ def save( # Tabular, Pandas if self.jargs.output.get("df_type") == "pandas": if type == "csv": - FS_Ops_Dispatcher().save_pandas( + FSOpsDispatcher().save_pandas( output, path, save_method="to_csv", save_kwargs=self.jargs.output.get("save_kwargs", {}), ) elif type == "parquet": - FS_Ops_Dispatcher().save_pandas( + FSOpsDispatcher().save_pandas( output, path, save_method="to_parquet", @@ -832,7 +832,7 @@ def save_metadata(self, elapsed): self.jargs.job_name, elapsed, ) - FS_Ops_Dispatcher().save_metadata(fname, content) + FSOpsDispatcher().save_metadata(fname, content) def query(self, query_str): logger.info("Query string:\n" + query_str) @@ -1278,7 +1278,7 @@ def expand_later(self, storage): path = self.path if "{latest}" in path: upstream_path = path.split("{latest}")[0] - paths = FS_Ops_Dispatcher().listdir(upstream_path) + paths = FSOpsDispatcher().listdir(upstream_path) latest_date = max(paths) path = path.format(latest=latest_date) return path diff --git a/yaetos/git_utils.py b/yaetos/git_utils.py index 563f6e95..4cec2654 100644 --- a/yaetos/git_utils.py +++ b/yaetos/git_utils.py @@ -2,47 +2,71 @@ import os import subprocess from yaetos.logger import setup_logging -logger = setup_logging('Job') +logger = setup_logging("Job") -class Git_Config_Manager(): - FNAME = 'conf/git_config.yml' +class GitConfigManager: + + FNAME = "conf/git_config.yml" def get_config(self, mode, **kwargs): - if mode == 'dev_local': - config = self.get_config_from_git(kwargs['local_app_folder']) + if mode == "dev_local": + config = self.get_config_from_git(kwargs["local_app_folder"]) # For debug: self.save_yaml(config) - elif mode in ('dev_EMR', 'prod_EMR'): - config = self.get_config_from_file(kwargs['cluster_app_folder']) + elif mode in ("dev_EMR", "prod_EMR"): + config = self.get_config_from_file(kwargs["cluster_app_folder"]) else: - raise Exception('Wrong mode') + raise Exception("Wrong mode") return config def get_config_from_git(self, local_app_folder): - branch = subprocess.check_output(["git", "describe", '--all']).strip().decode('ascii') # to get if dirty, add '--dirty'. 
- last_commit = subprocess.check_output(['git', 'rev-parse', 'HEAD']).strip().decode('ascii') - diffs = subprocess.check_output(['git', 'diff', 'HEAD']).strip().decode('ascii') + branch = ( + subprocess.check_output(["git", "describe", "--all"]) + .strip() + .decode("ascii") + ) # to get if dirty, add '--dirty'. + last_commit = ( + subprocess.check_output(["git", "rev-parse", "HEAD"]) + .strip() + .decode("ascii") + ) + diffs = subprocess.check_output(["git", "diff", "HEAD"]).strip().decode("ascii") is_dirty = True if diffs else False - branch_yaetos = subprocess.check_output(['git', 'describe', '--all'], cwd=local_app_folder).strip().decode('ascii') - last_commit_yaetos = subprocess.check_output(['git', 'rev-parse', 'HEAD'], cwd=local_app_folder).strip().decode('ascii') - diffs_yaetos = subprocess.check_output(['git', 'diff', 'HEAD'], cwd=local_app_folder).strip().decode('ascii') + branch_yaetos = ( + subprocess.check_output(["git", "describe", "--all"], cwd=local_app_folder) + .strip() + .decode("ascii") + ) + last_commit_yaetos = ( + subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=local_app_folder) + .strip() + .decode("ascii") + ) + diffs_yaetos = ( + subprocess.check_output(["git", "diff", "HEAD"], cwd=local_app_folder) + .strip() + .decode("ascii") + ) is_dirty_yaetos = True if diffs_yaetos else False - config = {'branch_current':branch, - 'last_commit_current':last_commit, - 'diffs_current':diffs, - 'is_dirty_current':is_dirty, - 'branch_yaetos':branch_yaetos, - 'last_commit_yaetos':last_commit_yaetos, - 'diffs_yaetos':diffs_yaetos, - 'is_dirty_yaetos':is_dirty_yaetos - } + config = { + "branch_current": branch, + "last_commit_current": last_commit, + "diffs_current": diffs, + "is_dirty_current": is_dirty, + "branch_yaetos": branch_yaetos, + "last_commit_yaetos": last_commit_yaetos, + "diffs_yaetos": diffs_yaetos, + "is_dirty_yaetos": is_dirty_yaetos, + } return config def is_git_controlled(self): - out = os.system('git rev-parse') # not using subprocess.check_output() to avoid crash if it fails. + out = os.system( + "git rev-parse" + ) # not using subprocess.check_output() to avoid crash if it fails. if out == 0: return True else: @@ -50,15 +74,15 @@ def is_git_controlled(self): def save_yaml(self, config): os.makedirs(os.path.dirname(self.FNAME), exist_ok=True) - with open(self.FNAME, 'w') as file: + with open(self.FNAME, "w") as file: ignored = yaml.dump(config, file) - logger.info('Saved yml with git info: {}'.format(self.FNAME)) + logger.info("Saved yml with git info: {}".format(self.FNAME)) def get_config_from_file(self, cluster_app_folder): """Meant to work in EMR""" - fname = cluster_app_folder+self.FNAME + fname = cluster_app_folder + self.FNAME if os.path.isfile(fname): - with open(fname, 'r') as stream: + with open(fname, "r") as stream: yml = yaml.load(stream) return yml else: diff --git a/yaetos/kafka_utils.py b/yaetos/kafka_utils.py index 21d661da..2b757a2a 100644 --- a/yaetos/kafka_utils.py +++ b/yaetos/kafka_utils.py @@ -9,7 +9,7 @@ logger = setup_logging('Kafka_push') -class KafkaProducer(object): +class KafkaProducer: def __init__(self, broker_address, topic, send_timeout, check_schema=False, schema_uri=None, connect_kafka=True): # TODO: add schema validation to only do message validation in the later stage (send()) to avoid validating schema for every record. 
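Note: the git_utils.py changes above rename Git_Config_Manager to GitConfigManager without changing its behavior. As a reference only (not part of the patch), here is a minimal usage sketch based solely on the methods visible in that hunk; the local_app_folder value is illustrative:

    from yaetos.git_utils import GitConfigManager

    mgr = GitConfigManager()
    # "dev_local" shells out to git to collect branch, last commit, diffs and dirty flags
    # for the current repo and for the yaetos repo located at local_app_folder.
    config = mgr.get_config(mode="dev_local", local_app_folder=".")
    mgr.save_yaml(config)  # writes conf/git_config.yml, read back on EMR via get_config_from_file()
    print(config["branch_current"], config["is_dirty_current"])
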
diff --git a/yaetos/libs/generic_jobs/copy_job.py b/yaetos/libs/generic_jobs/copy_job.py index d9f1dda9..1285a155 100644 --- a/yaetos/libs/generic_jobs/copy_job.py +++ b/yaetos/libs/generic_jobs/copy_job.py @@ -1,8 +1,7 @@ -from yaetos.etl_utils import ETL_Base, Commandliner +from yaetos.etl_utils import ETLBase, Commandliner -class Job(ETL_Base): - +class Job(ETLBase): def transform(self, table_to_copy): table_to_copy.cache() if table_to_copy.count() < 500000: @@ -11,5 +10,5 @@ def transform(self, table_to_copy): if __name__ == "__main__": - args = {'job_param_file': 'conf/jobs_metadata.yml'} + args = {"job_param_file": "conf/jobs_metadata.yml"} Commandliner(Job, **args) diff --git a/yaetos/libs/generic_jobs/dummy_job.py b/yaetos/libs/generic_jobs/dummy_job.py index 2af5183f..3e593ff1 100644 --- a/yaetos/libs/generic_jobs/dummy_job.py +++ b/yaetos/libs/generic_jobs/dummy_job.py @@ -1,11 +1,12 @@ -from yaetos.etl_utils import ETL_Base, Commandliner +from yaetos.etl_utils import ETLBase, Commandliner from pyspark.sql.types import StructType -class Job(ETL_Base): + +class Job(ETLBase): def transform(self): return self.sc_sql.createDataFrame([], StructType([])) if __name__ == "__main__": - args = {'job_param_file': 'conf/jobs_metadata.yml'} + args = {"job_param_file": "conf/jobs_metadata.yml"} Commandliner(Job, **args) diff --git a/yaetos/logger.py b/yaetos/logger.py index f810a6af..0512856a 100644 --- a/yaetos/logger.py +++ b/yaetos/logger.py @@ -6,9 +6,9 @@ def setup_logging(logger_name, default_level=logging.INFO): Setup logging configuration Use in scripts: logger = setup_logging('Job') """ - format = "|%(asctime)s:%(levelname)s:%(name)s|%(message)s" # Default: format = "%(levelname)s:%(name)s:%(message)s" + format = "|%(asctime)s:%(levelname)s:%(name)s|%(message)s" logging.basicConfig(level=default_level, format=format, datefmt="%H:%M:%S") - logging.getLogger('Deploy').setLevel(logging.INFO) - logging.getLogger('botocore.credentials').setLevel(logging.CRITICAL) - logging.getLogger('boto3.resources.action').setLevel(logging.CRITICAL) + logging.getLogger("Deploy").setLevel(logging.INFO) + logging.getLogger("botocore.credentials").setLevel(logging.CRITICAL) + logging.getLogger("boto3.resources.action").setLevel(logging.CRITICAL) return logging.getLogger(logger_name) diff --git a/yaetos/mysql_job.py b/yaetos/mysql_job.py index 6b33cdf3..5c412725 100644 --- a/yaetos/mysql_job.py +++ b/yaetos/mysql_job.py @@ -1,19 +1,20 @@ -from yaetos.etl_utils import ETL_Base, Commandliner, Cred_Ops_Dispatcher +from yaetos.etl_utils import ETLBase, Commandliner, Cred_Ops_Dispatcher from yaetos.db_utils import pdf_to_sdf from libs.python_db_connectors.query_mysql import query as query_mysql -class Mysql_Job(ETL_Base): - +class Mysql_Job(ETLBase): def query_mysql(self, query_str): - """ Requires OUTPUT_TYPES to be specified in class. + """Requires OUTPUT_TYPES to be specified in class. 
and mysql connection to be put in yml file in line like "api_inputs: {'api_creds': 'mysql_creds_section'}" output spark df.""" - query_str = query_str.replace('%', '%%') # replace necessary for pymysql parser - self.logger.info('Query string:\n' + query_str) - creds = Cred_Ops_Dispatcher().retrieve_secrets(self.jargs.storage, creds=self.jargs.connection_file) - creds_section = self.jargs.yml_args['api_inputs']['api_creds'] - self.logger.info('The query is using the credential section:' + creds_section) + query_str = query_str.replace("%", "%%") # replace necessary for pymysql parser + self.logger.info("Query string:\n" + query_str) + creds = Cred_Ops_Dispatcher().retrieve_secrets( + self.jargs.storage, creds=self.jargs.connection_file + ) + creds_section = self.jargs.yml_args["api_inputs"]["api_creds"] + self.logger.info("The query is using the credential section:" + creds_section) pdf = query_mysql(query_str, db=creds_section, creds_or_file=creds) sdf = pdf_to_sdf(pdf, self.OUTPUT_TYPES, self.sc, self.sc_sql) return sdf diff --git a/yaetos/oracle_sql_job.py b/yaetos/oracle_sql_job.py index 0c63658c..318b79f7 100644 --- a/yaetos/oracle_sql_job.py +++ b/yaetos/oracle_sql_job.py @@ -1,10 +1,10 @@ -from yaetos.etl_utils import ETL_Base, Commandliner, Cred_Ops_Dispatcher +from yaetos.etl_utils import ETLBase, Commandliner, Cred_Ops_Dispatcher from yaetos.db_utils import pdf_to_sdf from libs.python_db_connectors.query_oracle import query as query_oracle from sqlalchemy import types -class Job(ETL_Base): +class Job(ETLBase): """To run/deploy sql jobs, requires --sql_file arg.""" def set_job_file(self): diff --git a/yaetos/scripts/copy/ex0_extraction_job.py b/yaetos/scripts/copy/ex0_extraction_job.py index 5f9d297e..13998f8d 100644 --- a/yaetos/scripts/copy/ex0_extraction_job.py +++ b/yaetos/scripts/copy/ex0_extraction_job.py @@ -1,22 +1,24 @@ """ Demo basic extraction job using a public datasource (from wikimedia) """ -from yaetos.etl_utils import ETL_Base, Commandliner +from yaetos.etl_utils import ETLBase, Commandliner import requests import os import pandas as pd -class Job(ETL_Base): +class Job(ETLBase): def transform(self): - url = self.jargs.api_inputs['path'] + url = self.jargs.api_inputs["path"] resp = requests.get(url, allow_redirects=True) - self.logger.info('Finished reading file from {}.'.format(url)) + self.logger.info("Finished reading file from {}.".format(url)) # Save to local - tmp_dir = 'tmp' - os.makedirs(tmp_dir, exist_ok = True) - local_path = tmp_dir+'/tmp_file.csv.gz' - open(local_path, 'wb').write(resp.content) # creating local copy, necessary for sc_sql.read.csv, TODO: check to remove local copy step. - self.logger.info('Copied file locally at {}.'.format(local_path)) + tmp_dir = "tmp" + os.makedirs(tmp_dir, exist_ok=True) + local_path = tmp_dir + "/tmp_file.csv.gz" + open(local_path, "wb").write( + resp.content + ) # creating local copy, necessary for sc_sql.read.csv, TODO: check to remove local copy step. 
+ self.logger.info("Copied file locally at {}.".format(local_path)) # Save as dataframe pdf = pd.read_csv(local_path) @@ -25,5 +27,5 @@ def transform(self): if __name__ == "__main__": - args = {'job_param_file': 'conf/jobs_metadata.yml'} + args = {"job_param_file": "conf/jobs_metadata.yml"} Commandliner(Job, **args) diff --git a/yaetos/scripts/copy/ex1_frameworked_job.py b/yaetos/scripts/copy/ex1_frameworked_job.py index 1f76d8ca..e3a06903 100644 --- a/yaetos/scripts/copy/ex1_frameworked_job.py +++ b/yaetos/scripts/copy/ex1_frameworked_job.py @@ -1,20 +1,22 @@ """Same as ex1_full_sql_job.sql but allows access to spark for more complex ops (not used here but in ex2_frameworked_job.py).""" -from yaetos.etl_utils import ETL_Base, Commandliner +from yaetos.etl_utils import ETLBase, Commandliner -class Job(ETL_Base): +class Job(ETLBase): def transform(self, some_events, other_events): - df = self.query(""" + df = self.query( + """ SELECT se.session_id, count(*) as count_events FROM some_events se JOIN other_events oe on se.session_id=oe.session_id WHERE se.action='searchResultPage' and se.n_results>0 group by se.session_id order by count(*) desc - """) + """ + ) return df if __name__ == "__main__": - args = {'job_param_file': 'conf/jobs_metadata.yml'} + args = {"job_param_file": "conf/jobs_metadata.yml"} Commandliner(Job, **args) diff --git a/yaetos/scripts/copy/ex1_frameworked_job_test.py b/yaetos/scripts/copy/ex1_frameworked_job_test.py index 75f36ba6..88301e71 100644 --- a/yaetos/scripts/copy/ex1_frameworked_job_test.py +++ b/yaetos/scripts/copy/ex1_frameworked_job_test.py @@ -2,26 +2,39 @@ from jobs.examples.ex1_frameworked_job import Job -class Test_Job(object): +class TestJob: def test_transform(self, sc, sc_sql, ss, get_pre_jargs): - some_events = ss.read.json(sc.parallelize([ - {'session_id': 1234, 'action': 'searchResultPage', 'n_results': 1}, - {'session_id': 1234, 'action': 'searchResultPage', 'n_results': 1}, - {'session_id': 1235, 'action': 'searchResultPage', 'n_results': 1}, - {'session_id': 1236, 'action': 'other', 'n_results': 1}, - ])) + some_events = ss.read.json( + sc.parallelize( + [ + {"session_id": 1234, "action": "searchResultPage", "n_results": 1}, + {"session_id": 1234, "action": "searchResultPage", "n_results": 1}, + {"session_id": 1235, "action": "searchResultPage", "n_results": 1}, + {"session_id": 1236, "action": "other", "n_results": 1}, + ] + ) + ) - other_events = ss.read.json(sc.parallelize([ - {'session_id': 1234, 'other': 1}, - {'session_id': 1235, 'other': 1}, - {'session_id': 1236, 'other': 1}, - ])) + other_events = ss.read.json( + sc.parallelize( + [ + {"session_id": 1234, "other": 1}, + {"session_id": 1235, "other": 1}, + {"session_id": 1236, "other": 1}, + ] + ) + ) expected = [ - {'session_id': 1234, 'count_events': 2}, - {'session_id': 1235, 'count_events': 1}, - ] + {"session_id": 1234, "count_events": 2}, + {"session_id": 1235, "count_events": 1}, + ] - loaded_inputs={'some_events': some_events, 'other_events':other_events} - actual = Job(pre_jargs=get_pre_jargs(loaded_inputs.keys())).etl_no_io(sc, sc_sql, loaded_inputs=loaded_inputs)[0].toPandas().to_dict(orient='records') + loaded_inputs = {"some_events": some_events, "other_events": other_events} + actual = ( + Job(pre_jargs=get_pre_jargs(loaded_inputs.keys())) + .etl_no_io(sc, sc_sql, loaded_inputs=loaded_inputs)[0] + .toPandas() + .to_dict(orient="records") + ) assert actual == expected diff --git a/yaetos/scripts/copy/ex1_full_sql_job_test.py b/yaetos/scripts/copy/ex1_full_sql_job_test.py 
index d15d3397..e5f46bc9 100644 --- a/yaetos/scripts/copy/ex1_full_sql_job_test.py +++ b/yaetos/scripts/copy/ex1_full_sql_job_test.py @@ -2,30 +2,43 @@ from yaetos.sql_job import Job -class Test_Job(object): +class TestJob: def test_transform(self, sc, sc_sql, ss, get_pre_jargs): - some_events = ss.read.json(sc.parallelize([ - {'session_id': 1234, 'action': 'searchResultPage', 'n_results': 1}, - {'session_id': 1234, 'action': 'searchResultPage', 'n_results': 1}, - {'session_id': 1235, 'action': 'searchResultPage', 'n_results': 1}, - {'session_id': 1236, 'action': 'other', 'n_results': 1}, - ])) + some_events = ss.read.json( + sc.parallelize( + [ + {"session_id": 1234, "action": "searchResultPage", "n_results": 1}, + {"session_id": 1234, "action": "searchResultPage", "n_results": 1}, + {"session_id": 1235, "action": "searchResultPage", "n_results": 1}, + {"session_id": 1236, "action": "other", "n_results": 1}, + ] + ) + ) - other_events = ss.read.json(sc.parallelize([ - {'session_id': 1234, 'other': 1}, - {'session_id': 1235, 'other': 1}, - {'session_id': 1236, 'other': 1}, - ])) + other_events = ss.read.json( + sc.parallelize( + [ + {"session_id": 1234, "other": 1}, + {"session_id": 1235, "other": 1}, + {"session_id": 1236, "other": 1}, + ] + ) + ) expected = [ - {'session_id': 1234, 'count_events': 2}, - {'session_id': 1235, 'count_events': 1}, - ] + {"session_id": 1234, "count_events": 2}, + {"session_id": 1235, "count_events": 1}, + ] - sql_file = 'jobs/examples/ex1_full_sql_job.sql' + sql_file = "jobs/examples/ex1_full_sql_job.sql" - loaded_inputs={'some_events': some_events, 'other_events':other_events} + loaded_inputs = {"some_events": some_events, "other_events": other_events} pre_jargs = get_pre_jargs(loaded_inputs.keys()) - pre_jargs['cmd_args']['sql_file'] = sql_file - actual = Job(pre_jargs=pre_jargs).etl_no_io(sc, sc_sql, loaded_inputs=loaded_inputs)[0].toPandas().to_dict(orient='records') + pre_jargs["cmd_args"]["sql_file"] = sql_file + actual = ( + Job(pre_jargs=pre_jargs) + .etl_no_io(sc, sc_sql, loaded_inputs=loaded_inputs)[0] + .toPandas() + .to_dict(orient="records") + ) assert actual == expected diff --git a/yaetos/scripts/install_env.py b/yaetos/scripts/install_env.py index 4cc452c2..5b8ee416 100644 --- a/yaetos/scripts/install_env.py +++ b/yaetos/scripts/install_env.py @@ -18,56 +18,60 @@ # TODO: replace duplicated files in yaetos/script folder to hard symlinks to avoid duplication. -class YaetosCmds(object): + +class YaetosCmds: # Source: https://chase-seibert.github.io/blog/2014/03/21/python-multilevel-argparse.html usage_setup = "Setup yaetos folders and files in current folder." usage_launch_env = "Launching docker container to run jobs." - usage = f''' + usage = f""" yaetos [] Yaetos top level commands are: setup {usage_setup} launch_env {usage_launch_env} - ''' + """ def __init__(self): parser = argparse.ArgumentParser( - description='Yeatos command lines', - usage=self.usage) - parser.add_argument('command', help='Subcommand to run') + description="Yeatos command lines", usage=self.usage + ) + parser.add_argument("command", help="Subcommand to run") args = parser.parse_args(sys.argv[1:2]) if not hasattr(self, args.command): - print('Unrecognized command') + print("Unrecognized command") parser.print_help() exit(1) - getattr(self, args.command)() # dispatching according to command line. + getattr(self, args.command)() # dispatching according to command line. 
     def setup(self):
-        parser = argparse.ArgumentParser(
-            description=self.usage_setup)
-        parser.add_argument('--set_github', action='store_true')
-        args = parser.parse_args(sys.argv[2:]) # ignoring first 2 args (i.e. "yeatos setup")
+        parser = argparse.ArgumentParser(description=self.usage_setup)
+        parser.add_argument("--set_github", action="store_true")
+        args = parser.parse_args(
+            sys.argv[2:]
+        )  # ignoring first 2 args (i.e. "yaetos setup")
         setup_env(args)
 
     def launch_env(self):
-        parser = argparse.ArgumentParser(
-            description=self.usage_launch_env)
+        parser = argparse.ArgumentParser(description=self.usage_launch_env)
         # parser.add_argument('--no_aws', action='store_true')
-        args = parser.parse_args(sys.argv[2:]) # ignoring first 2 args (i.e. "yeatos launch_env")
+        args = parser.parse_args(
+            sys.argv[2:]
+        )  # ignoring first 2 args (i.e. "yaetos launch_env")
         launch_env()
 
-
 def setup_env(args):
     cwd = os.getcwd()
-    print(f'Will setup yaetos in the current folder ({cwd})')
+    print(f"Will set up yaetos in the current folder ({cwd})")
     paths = yaetos.__path__
     package_path = paths[0]
-    if len(paths) > 1 :
-        print(f'Yeatos python package found in several locations. The script will use this one: {package_path}')
+    if len(paths) > 1:
+        print(
+            f"Yaetos python package found in several locations. The script will use this one: {package_path}"
+        )
 
     # Empty folders necessary for later.
     os.system("mkdir -p tmp/files_to_ship/")
@@ -75,46 +79,93 @@ def setup_env(args):
     # TODO: make code above and below compatible with windows OS (no cmd line, no linux only paths).
 
     # Root folder files
-    copyfile(f'{package_path}/scripts/copy/Dockerfile_external', f'{cwd}/Dockerfile')
-    copyfile(f'{package_path}/scripts/copy/launch_env_external.sh', f'{cwd}/launch_env.sh')
-    os.chmod(f'{cwd}/launch_env.sh', 0o755) # TODO: use stat.S_IEXEC instead to be cross plateform
+    copyfile(f"{package_path}/scripts/copy/Dockerfile_external", f"{cwd}/Dockerfile")
+    copyfile(
+        f"{package_path}/scripts/copy/launch_env_external.sh", f"{cwd}/launch_env.sh"
+    )
+    os.chmod(
+        f"{cwd}/launch_env.sh", 0o755
+    )  # TODO: use stat.S_IEXEC instead to be cross platform
 
     # Conf
     os.system("mkdir -p conf/")
-    copyfile(f'{package_path}/scripts/copy/aws_config.cfg.example', f'{cwd}/conf/aws_config.cfg')
-    copyfile(f'{package_path}/scripts/copy/jobs_metadata_external.yml', f'{cwd}/conf/jobs_metadata.yml')
-    copyfile(f'{package_path}/scripts/copy/connections.cfg.example', f'{cwd}/conf/connections.cfg')
-    copyfile(f'{package_path}/scripts/copy/requirements_extra.txt', f'{cwd}/conf/requirements_extra.txt')
+    copyfile(
+        f"{package_path}/scripts/copy/aws_config.cfg.example",
+        f"{cwd}/conf/aws_config.cfg",
+    )
+    copyfile(
+        f"{package_path}/scripts/copy/jobs_metadata_external.yml",
+        f"{cwd}/conf/jobs_metadata.yml",
+    )
+    copyfile(
+        f"{package_path}/scripts/copy/connections.cfg.example",
+        f"{cwd}/conf/connections.cfg",
+    )
+    copyfile(
+        f"{package_path}/scripts/copy/requirements_extra.txt",
+        f"{cwd}/conf/requirements_extra.txt",
+    )
 
     # Sample jobs
     os.system("mkdir -p jobs/generic/")
-    copyfile(f'{package_path}/libs/generic_jobs/copy_job.py', f'{cwd}/jobs/generic/copy_job.py')
-    copyfile(f'{package_path}/libs/generic_jobs/deployer.py', f'{cwd}/jobs/generic/deployer.py')
-    copyfile(f'{package_path}/libs/generic_jobs/dummy_job.py', f'{cwd}/jobs/generic/dummy_job.py')
-    copyfile(f'{package_path}/libs/generic_jobs/launcher.py', f'{cwd}/jobs/generic/launcher.py')
+    copyfile(
+        f"{package_path}/libs/generic_jobs/copy_job.py",
+        f"{cwd}/jobs/generic/copy_job.py",
+ ) + copyfile( + f"{package_path}/libs/generic_jobs/deployer.py", + f"{cwd}/jobs/generic/deployer.py", + ) + copyfile( + f"{package_path}/libs/generic_jobs/dummy_job.py", + f"{cwd}/jobs/generic/dummy_job.py", + ) + copyfile( + f"{package_path}/libs/generic_jobs/launcher.py", + f"{cwd}/jobs/generic/launcher.py", + ) # Sample jobs os.system("mkdir -p jobs/examples/") - copyfile(f'{package_path}/scripts/copy/ex0_extraction_job.py', f'{cwd}/jobs/examples/ex0_extraction_job.py') - copyfile(f'{package_path}/scripts/copy/ex1_frameworked_job.py', f'{cwd}/jobs/examples/ex1_frameworked_job.py') - copyfile(f'{package_path}/scripts/copy/ex1_full_sql_job.sql', f'{cwd}/jobs/examples/ex1_full_sql_job.sql') + copyfile( + f"{package_path}/scripts/copy/ex0_extraction_job.py", + f"{cwd}/jobs/examples/ex0_extraction_job.py", + ) + copyfile( + f"{package_path}/scripts/copy/ex1_frameworked_job.py", + f"{cwd}/jobs/examples/ex1_frameworked_job.py", + ) + copyfile( + f"{package_path}/scripts/copy/ex1_full_sql_job.sql", + f"{cwd}/jobs/examples/ex1_full_sql_job.sql", + ) # Sample jobs tests os.system("mkdir -p tests/jobs/example/") - copyfile(f'{package_path}/scripts/copy/conftest.py', f'{cwd}/tests/conftest.py') - copyfile(f'{package_path}/scripts/copy/ex1_frameworked_job_test.py', f'{cwd}/tests/jobs/examples/ex1_frameworked_job_test.py') - copyfile(f'{package_path}/scripts/copy/ex1_full_sql_job_test.py', f'{cwd}/tests/jobs/examples/ex1_full_sql_job_test.py') + copyfile(f"{package_path}/scripts/copy/conftest.py", f"{cwd}/tests/conftest.py") + copyfile( + f"{package_path}/scripts/copy/ex1_frameworked_job_test.py", + f"{cwd}/tests/jobs/examples/ex1_frameworked_job_test.py", + ) + copyfile( + f"{package_path}/scripts/copy/ex1_full_sql_job_test.py", + f"{cwd}/tests/jobs/examples/ex1_full_sql_job_test.py", + ) # TODO: add setup awscli or make sure it is there. # setup github CI if args.set_github: os.system("mkdir -p .github/workflows/") - copyfile(f'{package_path}/scripts/github_pythonapp.yml', f'{cwd}/.github/workflows/pythonapp.yml') + copyfile( + f"{package_path}/scripts/github_pythonapp.yml", + f"{cwd}/.github/workflows/pythonapp.yml", + ) - print('Done') + print("Done") def launch_env(): import subprocess + subprocess.call("./launch_env.sh") diff --git a/yaetos/sql_job.py b/yaetos/sql_job.py index ec108f7c..6c1bb01c 100644 --- a/yaetos/sql_job.py +++ b/yaetos/sql_job.py @@ -1,26 +1,32 @@ -from yaetos.etl_utils import ETL_Base, Commandliner, Job_Args_Parser, Job_Yml_Parser +from yaetos.etl_utils import ETLBase, Commandliner, Job_Args_Parser, Job_Yml_Parser -class Job(ETL_Base): +class Job(ETLBase): """To run/deploy sql jobs, using --sql_file arg.""" def set_jargs(self, pre_jargs, loaded_inputs={}): # Function called only if running the job directly, i.e. 
"python yaetos/sql_job.py --sql_file=jobs/some_job.sql", ignored if running from "python jobs/generic/launcher.py --job_name=some_job.sql" - sql_file=pre_jargs['cmd_args']['sql_file'] + sql_file = pre_jargs["cmd_args"]["sql_file"] job_name = Job_Yml_Parser.set_job_name_from_file(sql_file) - pre_jargs['job_args']['job_name'] = job_name - return Job_Args_Parser(defaults_args=pre_jargs['defaults_args'], yml_args=None, job_args=pre_jargs['job_args'], cmd_args=pre_jargs['cmd_args'], loaded_inputs=loaded_inputs) + pre_jargs["job_args"]["job_name"] = job_name + return Job_Args_Parser( + defaults_args=pre_jargs["defaults_args"], + yml_args=None, + job_args=pre_jargs["job_args"], + cmd_args=pre_jargs["cmd_args"], + loaded_inputs=loaded_inputs, + ) def transform(self, **ignored): sql = self.read_sql_file(self.jargs.sql_file) df = self.query(sql) - if self.jargs.merged_args.get('repartition'): - df = df.repartition(self.jargs.merged_args['repartition']) + if self.jargs.merged_args.get("repartition"): + df = df.repartition(self.jargs.merged_args["repartition"]) return df @staticmethod def read_sql_file(fname): - fh = open(fname, 'r') + fh = open(fname, "r") sql = fh.read() fh.close() return sql From 432a97f711279d755c930c21ac1e5fe649d3ed3f Mon Sep 17 00:00:00 2001 From: AlejandroUPC Date: Tue, 31 May 2022 03:21:12 +0200 Subject: [PATCH 10/10] :wrench: forgot to commit file --- yaetos/env_dispatchers.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/yaetos/env_dispatchers.py b/yaetos/env_dispatchers.py index 924f52fd..e4a3f98e 100644 --- a/yaetos/env_dispatchers.py +++ b/yaetos/env_dispatchers.py @@ -47,10 +47,9 @@ def save_metadata(self, fname, content): ) else self.save_metadata_local(fname, content) @staticmethod - def save_metadata_local(fname, content): - fh = open(fname, "w") - fh.write(content) - fh.close() + def save_metadata_local(fname: str, content) -> None: + with open(fname, "w") as fh: + fh.write(content) logger.info("Created file locally: {}".format(fname)) @staticmethod