From 70afd6dbb7abb93907a3e2f61ee674c7af97fd2f Mon Sep 17 00:00:00 2001 From: William Lo Date: Thu, 10 Oct 2019 20:54:49 -0700 Subject: [PATCH] [GOBBLIN-895] Fixes Gobblin Standalone configs and scripts so that the user guide is accurate Closes #2751 from Will-Lo/fix-gobblin-standalone-script --- bin/gobblin-admin.sh | 2 +- bin/gobblin-aws.sh | 2 +- bin/gobblin-cluster-master.sh | 2 +- bin/gobblin-cluster-worker.sh | 2 +- bin/gobblin-mapreduce.sh | 2 +- bin/gobblin-service.sh | 2 +- bin/gobblin-standalone.sh | 2 +- bin/gobblin-yarn.sh | 2 +- bin/gobblin.sh | 20 ++++++++- bin/gobblin_password_encryptor.sh | 2 +- bin/historystore-manager.sh | 2 +- bin/statestore-checker.sh | 2 +- bin/statestore-cleaner.sh | 2 +- conf/standalone/application.conf | 69 +++++++++++-------------------- conf/standalone/log4j.xml | 32 ++++++++++++++ gobblin-docs/Getting-Started.md | 4 +- 16 files changed, 89 insertions(+), 60 deletions(-) create mode 100644 conf/standalone/log4j.xml diff --git a/bin/gobblin-admin.sh b/bin/gobblin-admin.sh index 1a510db4e1b..1e95d0ed2a0 100755 --- a/bin/gobblin-admin.sh +++ b/bin/gobblin-admin.sh @@ -17,7 +17,7 @@ # limitations under the License. # -# @depricated: This script is kept for backward compatibility only and will be removed in future. Use gobblin.sh +# @deprecated: This script is kept for backward compatibility only and will be removed in future. Use gobblin.sh CURRENT_DIR="$(cd `dirname $0`/..; pwd)" $CURRENT_DIR/bin/gobblin cli $@ \ No newline at end of file diff --git a/bin/gobblin-aws.sh b/bin/gobblin-aws.sh index 48fd15fc7bd..dab9a95fb56 100755 --- a/bin/gobblin-aws.sh +++ b/bin/gobblin-aws.sh @@ -17,7 +17,7 @@ # limitations under the License. # -# @depricated: This script is kept for backward compatibility only and will be removed in future. Use gobblin.sh +# @deprecated: This script is kept for backward compatibility only and will be removed in future. 
Use gobblin.sh CURRENT_DIR="$(cd `dirname $0`/..; pwd)" $CURRENT_DIR/bin/gobblin service aws $@ \ No newline at end of file diff --git a/bin/gobblin-cluster-master.sh b/bin/gobblin-cluster-master.sh index cc39264d4d9..0632d495a83 100755 --- a/bin/gobblin-cluster-master.sh +++ b/bin/gobblin-cluster-master.sh @@ -17,7 +17,7 @@ # limitations under the License. # -# @depricated: This script is kept for backward compatibility only and will be removed in future. Use gobblin.sh +# @deprecated: This script is kept for backward compatibility only and will be removed in future. Use gobblin.sh CURRENT_DIR="$(cd `dirname $0`/..; pwd)" $CURRENT_DIR/bin/gobblin service cluster-master $@ \ No newline at end of file diff --git a/bin/gobblin-cluster-worker.sh b/bin/gobblin-cluster-worker.sh index ec99c831488..8e607d5a28b 100755 --- a/bin/gobblin-cluster-worker.sh +++ b/bin/gobblin-cluster-worker.sh @@ -17,7 +17,7 @@ # limitations under the License. # -# @depricated: This script is kept for backward compatibility only and will be removed in future. Use gobblin.sh +# @deprecated: This script is kept for backward compatibility only and will be removed in future. Use gobblin.sh CURRENT_DIR="$(cd `dirname $0`/..; pwd)" $CURRENT_DIR/bin/gobblin service cluster-worker $@ diff --git a/bin/gobblin-mapreduce.sh b/bin/gobblin-mapreduce.sh index 5e050fa20d3..9ee0cc8393c 100755 --- a/bin/gobblin-mapreduce.sh +++ b/bin/gobblin-mapreduce.sh @@ -17,7 +17,7 @@ # limitations under the License. # -# @depricated: This script is kept for backward compatibility only and will be removed in future. Use gobblin.sh +# @deprecated: This script is kept for backward compatibility only and will be removed in future. 
Use gobblin.sh ############################################################## ############### Run Gobblin Jobs on Hadoop MR ################ diff --git a/bin/gobblin-service.sh b/bin/gobblin-service.sh index cafb15c37f0..6c080a2c43c 100755 --- a/bin/gobblin-service.sh +++ b/bin/gobblin-service.sh @@ -17,7 +17,7 @@ # limitations under the License. # -# @depricated: This script is kept for backward compatibility only and will be removed in future. Use gobblin.sh +# @deprecated: This script is kept for backward compatibility only and will be removed in future. Use gobblin.sh CURRENT_DIR="$(cd `dirname $0`/..; pwd)" $CURRENT_DIR/bin/gobblin service service-manager $@ \ No newline at end of file diff --git a/bin/gobblin-standalone.sh b/bin/gobblin-standalone.sh index 356d3c45067..9210004576f 100755 --- a/bin/gobblin-standalone.sh +++ b/bin/gobblin-standalone.sh @@ -17,7 +17,7 @@ # limitations under the License. # -# @depricated: This script is kept for backward compatibility only and will be removed in future. Use gobblin.sh +# @deprecated: This script is kept for backward compatibility only and will be removed in future. Use gobblin.sh CURRENT_DIR="$(cd `dirname $0`/..; pwd)" $CURRENT_DIR/bin/gobblin service standalone $@ \ No newline at end of file diff --git a/bin/gobblin-yarn.sh b/bin/gobblin-yarn.sh index e5316515d0b..6e5c1389d7a 100755 --- a/bin/gobblin-yarn.sh +++ b/bin/gobblin-yarn.sh @@ -17,7 +17,7 @@ # limitations under the License. # -# @depricated: This script is kept for backward compatibility only and will be removed in future. Use gobblin.sh +# @deprecated: This script is kept for backward compatibility only and will be removed in future. 
Use gobblin.sh CURRENT_DIR="$(cd `dirname $0`/..; pwd)" $CURRENT_DIR/bin/gobblin service yarn $@ \ No newline at end of file diff --git a/bin/gobblin.sh b/bin/gobblin.sh index d8d97fda069..7e32a90ee05 100755 --- a/bin/gobblin.sh +++ b/bin/gobblin.sh @@ -238,6 +238,11 @@ if [[ "$GOBBLIN_MODE_TYPE" == "$CLI" ]]; then fi fi +CHECK_ENV_VARS=false +if [ $ACTION == "start" ] || [ $ACTION == "restart" ]; then + CHECK_ENV_VARS=true +fi + # derived based on input from user, $GOBBLIN_MODE PID_FILE_NAME=".gobblin-$GOBBLIN_MODE.pid" PID_FILE="$GOBBLIN_HOME/$PID_FILE_NAME" @@ -263,6 +268,10 @@ if [[ -n "$USER_LOG4J_FILE" ]]; then elif [[ -f ${GOBBLIN_CONF}/log4j2.xml ]]; then LOG4J_FILE_PATH=file://${GOBBLIN_CONF}/log4j2.xml LOG4J_OPTS="-Dlog4j.configuration=$LOG4J_FILE_PATH" +#prefer log4j.xml +elif [[ -f ${GOBBLIN_CONF}/log4j.xml ]]; then + LOG4J_FILE_PATH=file://${GOBBLIN_CONF}/log4j.xml + LOG4J_OPTS="-Dlog4j.configuration=$LOG4J_FILE_PATH" #defaults to log4j.properties elif [[ -f ${GOBBLIN_CONF}/log4j.properties ]]; then LOG4J_FILE_PATH=file://${GOBBLIN_CONF}/log4j.properties @@ -372,6 +381,7 @@ function start() { LOG_OUT_FILE="${GOBBLIN_LOGS}/${GOBBLIN_MODE}.out" LOG_ERR_FILE="${GOBBLIN_LOGS}/${GOBBLIN_MODE}.err" + ADDITIONAL_ARGS="" # for all gobblin commands if [[ "$GOBBLIN_MODE_TYPE" == "$CLI" ]]; then @@ -417,7 +427,15 @@ function start() { CLASS_N_ARGS='' if [[ "$GOBBLIN_MODE" = "$STANDALONE_MODE" ]]; then CLASS_N_ARGS="$STANDALONE_CLASS $GOBBLIN_CONF/application.conf" + ADDITIONAL_ARGS="-Dgobblin.logs.dir=${GOBBLIN_LOGS}" + + if [ -z "$GOBBLIN_WORK_DIR" ] && [ "$CHECK_ENV_VARS" == true ]; then + die "GOBBLIN_WORK_DIR is not set!" + fi + if [ -z "$GOBBLIN_JOB_CONFIG_DIR" ] && [ "$CHECK_ENV_VARS" == true ]; then + die "Environment variable GOBBLIN_JOB_CONFIG_DIR not set!" + fi elif [[ "$GOBBLIN_MODE" = "$AWS_MODE" ]]; then CLASS_N_ARGS="$AWS_CLASS" @@ -442,7 +460,7 @@ function start() { echo "Invalid gobblin command or execution mode... 
[EXITING]" exit 1 fi - GOBBLIN_COMMAND="$JAVA_HOME/bin/java -cp $GOBBLIN_CLASSPATH $GC_OPTS $JVM_OPTS $LOG4J_OPTS $CLASS_N_ARGS" + GOBBLIN_COMMAND="$JAVA_HOME/bin/java -cp $GOBBLIN_CLASSPATH $GC_OPTS $JVM_OPTS $LOG4J_OPTS $ADDITIONAL_ARGS $CLASS_N_ARGS" fi # execute the command diff --git a/bin/gobblin_password_encryptor.sh b/bin/gobblin_password_encryptor.sh index 0fdc2116ab1..9dec72aaa3e 100755 --- a/bin/gobblin_password_encryptor.sh +++ b/bin/gobblin_password_encryptor.sh @@ -17,7 +17,7 @@ # limitations under the License. # -# @depricated: This script is kept for backward compatibility only and will be removed in future. Use gobblin.sh +# @deprecated: This script is kept for backward compatibility only and will be removed in future. Use gobblin.sh script_dir=$(dirname $0) lib_dir=${script_dir}/../lib diff --git a/bin/historystore-manager.sh b/bin/historystore-manager.sh index c2fbc259913..0e06711825c 100755 --- a/bin/historystore-manager.sh +++ b/bin/historystore-manager.sh @@ -17,7 +17,7 @@ # limitations under the License. # -# @depricated: This script is kept for backward compatibility only and will be removed in future. Use gobblin.sh +# @deprecated: This script is kept for backward compatibility only and will be removed in future. Use gobblin.sh CURRENT_DIR="$(cd `dirname $0`/..; pwd)" $CURRENT_DIR/bin/gobblin cli job-store-schema-manager $@ \ No newline at end of file diff --git a/bin/statestore-checker.sh b/bin/statestore-checker.sh index ff9661e4b2e..60c0279759c 100755 --- a/bin/statestore-checker.sh +++ b/bin/statestore-checker.sh @@ -17,7 +17,7 @@ # limitations under the License. # -# @depricated: This script is kept for backward compatibility only and will be removed in future. Use gobblin.sh +# @deprecated: This script is kept for backward compatibility only and will be removed in future. 
Use gobblin.sh CURRENT_DIR="$(cd `dirname $0`/..; pwd)" $CURRENT_DIR/bin/gobblin cli job-state-to-json $@ diff --git a/bin/statestore-cleaner.sh b/bin/statestore-cleaner.sh index 2999220144f..f4eacbefea0 100755 --- a/bin/statestore-cleaner.sh +++ b/bin/statestore-cleaner.sh @@ -17,7 +17,7 @@ # limitations under the License. # -# @depricated: This script is kept for backward compatibility only and will be removed in future. Use gobblin.sh +# @deprecated: This script is kept for backward compatibility only and will be removed in future. Use gobblin.sh FWDIR="$(cd `dirname $0`/..; pwd)" diff --git a/conf/standalone/application.conf b/conf/standalone/application.conf index 3a856e33087..77e118280b3 100644 --- a/conf/standalone/application.conf +++ b/conf/standalone/application.conf @@ -15,70 +15,44 @@ # limitations under the License. # -# Cluster configuration properties -gobblin.cluster.app.name=GobblinStandaloneCluster -gobblin.cluster.email.notification.on.shutdown=false -gobblin.cluster.helix.instance.max.retries=2 -gobblin.cluster.work.dir=/tmp/gobblin-cluster - -# Helix/Zookeeper configuration properties -gobblin.cluster.helix.cluster.name=GobblinStandaloneCluster -gobblin.cluster.zk.connection.string="localhost:2181" - -# job config monitor interval -jobconf.monitor.interval=30000 - -# Sample configuration properties for the Gobblin Standalone cluster -gobblin.cluster.workDir=${gobblin.cluster.work.dir}/GobblinStandaloneCluster - -# default is the JobConfigurationManager -# use this manager to accept jobs from Kafka. It requires some additional Kafka related parameters. 
-#gobblin.cluster.job.configuration.manager=org.apache.gobblin.cluster.StreamingJobConfigurationManager -#spec.kafka.topics=ruyang_test_kafka_gobblin -#kafka.brokers="hostname:12913/kafka-queuing" -#jobSpecMonitor.kafka.zookeeper.connect="hostname:12913/kafka-queuing" - -# Cluster configuration properties -gobblin.cluster.helix.cluster.name=GobblinStandaloneClusterCli - -# used by the JobConfigurationManager -gobblin.cluster.job.conf.path=${gobblin.cluster.work.dir}/jobs -gobblin.cluster.jobconf.fullyQualifiedPath=${gobblin.cluster.work.dir}/jobs -gobblin.cluster.job.catalog=org.apache.gobblin.runtime.job_catalog.FSJobCatalog +# Thread pool settings for the task executor +taskexecutor.threadpool.size=2 +taskretry.threadpool.coresize=1 +taskretry.threadpool.maxsize=2 # File system URIs -fs.uri="file:///" +fs.uri=file:/// writer.fs.uri=${fs.uri} state.store.fs.uri=${fs.uri} # Writer related configuration properties -writer.destination.type=HDFS writer.output.format=AVRO -writer.staging.dir=${gobblin.cluster.work.dir}/task-staging -writer.output.dir=${gobblin.cluster.work.dir}/task-output +writer.staging.dir=${env:GOBBLIN_WORK_DIR}/task-staging +writer.output.dir=${env:GOBBLIN_WORK_DIR}/task-output # Data publisher related configuration properties data.publisher.type=org.apache.gobblin.publisher.BaseDataPublisher -data.publisher.final.dir=${gobblin.cluster.work.dir}/job-output +data.publisher.final.dir=${env:GOBBLIN_WORK_DIR}/job-output data.publisher.replace.final.dir=false +# Directory where job configuration files are stored +jobconf.dir=${env:GOBBLIN_JOB_CONFIG_DIR} +jobconf.fullyQualifiedPath=file://${env:GOBBLIN_JOB_CONFIG_DIR} + # Directory where job/task state files are stored -state.store.dir=${gobblin.cluster.work.dir}/state-store +state.store.dir=${env:GOBBLIN_WORK_DIR}/state-store -# Directory where error files from the quality checkers are stored -qualitychecker.row.err.file=${gobblin.cluster.work.dir}/err +# Directory where commit sequences are stored 
+gobblin.runtime.commit.sequence.store.dir=${env:GOBBLIN_WORK_DIR}/commit-sequence-store -# Disable job locking for now -job.lock.enabled=false +# Directory where error files from the quality checkers are stored +qualitychecker.row.err.file=${env:GOBBLIN_WORK_DIR}/err # Directory where job locks are stored -job.lock.dir=${gobblin.cluster.work.dir}/locks +job.lock.dir=${env:GOBBLIN_WORK_DIR}/locks # Directory where metrics log files are stored -metrics.log.dir=${gobblin.cluster.work.dir}/metrics - -# Interval of task state reporting in milliseconds -task.status.reportintervalinms=1000 +metrics.log.dir=${env:GOBBLIN_WORK_DIR}/metrics # Enable metrics / events metrics.enabled=true @@ -94,3 +68,8 @@ rest.server.port=9090 # job history store ( WARN [GobblinYarnAppLauncher] NOT starting the admin UI because the job execution info server is NOT enabled ) job.execinfo.server.enabled=false job.history.store.enabled=false +task.status.reportintervalinms=5000 + +# The time gap for Job Detector to detect modification/deletion/creation of jobconfig. +# Unit in milliseconds, configurable. +jobconf.monitor.interval=30000 diff --git a/conf/standalone/log4j.xml b/conf/standalone/log4j.xml new file mode 100644 index 00000000000..436a2b02721 --- /dev/null +++ b/conf/standalone/log4j.xml @@ -0,0 +1,32 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/gobblin-docs/Getting-Started.md b/gobblin-docs/Getting-Started.md index 5f5d8434a38..29ac9864302 100644 --- a/gobblin-docs/Getting-Started.md +++ b/gobblin-docs/Getting-Started.md @@ -89,7 +89,7 @@ Each Gobblin job minimally involves several constructs, e.g. 
[Source](https://gi Some of the classes relevant to this example include [WikipediaSource](https://github.com/apache/incubator-gobblin/blob/master/gobblin-example/src/main/java/org/apache/gobblin/example/wikipedia/WikipediaSource.java), [WikipediaExtractor](https://github.com/apache/incubator-gobblin/blob/master/gobblin-example/src/main/java/org/apache/gobblin/example/wikipedia/WikipediaExtractor.java), [WikipediaConverter](https://github.com/apache/incubator-gobblin/blob/master/gobblin-example/src/main/java/org/apache/gobblin/example/wikipedia/WikipediaConverter.java), [AvroHdfsDataWriter](https://github.com/apache/incubator-gobblin/blob/master/gobblin-core/src/main/java/org/apache/gobblin/writer/AvroHdfsDataWriter.java) and [BaseDataPublisher](https://github.com/apache/incubator-gobblin/blob/master/gobblin-core/src/main/java/org/apache/gobblin/publisher/BaseDataPublisher.java). -To run Gobblin in standalone daemon mode we need a Gobblin configuration file (such as uses [gobblin-standalone.properties](https://github.com/apache/incubator-gobblin/blob/master/conf/gobblin-standalone-v2.properties)). And for each job we wish to run, we also need a job configuration file (such as [wikipedia.pull](https://github.com/apache/incubator-gobblin/blob/master/gobblin-example/src/main/resources/wikipedia.pull)). The Gobblin configuration file, which is passed to Gobblin as a command line argument, should contain a property `jobconf.dir` which specifies where the job configuration files are located. By default, `jobconf.dir` points to environment variable `GOBBLIN_JOB_CONFIG_DIR`. Each file in `jobconf.dir` with extension `.job` or `.pull` is considered a job configuration file, and Gobblin will launch a job for each such file. For more information on Gobblin deployment in standalone mode, refer to the [Standalone Deployment](user-guide/Gobblin-Deployment#Standalone-Deployment) page. 
+To run Gobblin in standalone daemon mode we need a Gobblin configuration file (such as [application.conf](https://github.com/apache/incubator-gobblin/blob/master/conf/standalone/application.conf)). And for each job we wish to run, we also need a job configuration file (such as [wikipedia.pull](https://github.com/apache/incubator-gobblin/blob/master/gobblin-example/src/main/resources/wikipedia.pull)). The Gobblin configuration file, which is passed to Gobblin as a command line argument, should contain a property `jobconf.dir` which specifies where the job configuration files are located. By default, `jobconf.dir` points to environment variable `GOBBLIN_JOB_CONFIG_DIR`. Each file in `jobconf.dir` with extension `.job` or `.pull` is considered a job configuration file, and Gobblin will launch a job for each such file. For more information on Gobblin deployment in standalone mode, refer to the [Standalone Deployment](user-guide/Gobblin-Deployment#Standalone-Deployment) page. A list of commonly used configuration properties can be found here: [Configuration Properties Glossary](user-guide/Configuration-Properties-Glossary). @@ -107,7 +107,7 @@ A list of commonly used configuration properties can be found here: [Configurati gobblin service standalone start ``` -The job log, which contains the progress and status of the job, will be written into `logs/<execution-mode>.out` & `logs/<execution-mode>.err` (to change where the log is written, modify the Log4j configuration file `conf/log4j.properties`). +Stdout and the job log, which contains the progress and status of the job, will be written into `logs/<execution-mode>.out` & `logs/<execution-mode>.err` (to change where the log is written, modify the Log4j configuration file `conf/log4j.xml`). Among the job logs there should be the following information: