From b85508056844c85eefa8ed783fb93447b5fcd121 Mon Sep 17 00:00:00 2001 From: ZackYoung Date: Fri, 17 Nov 2023 22:06:46 +0800 Subject: [PATCH] [Optimize] You can submit Yarn-Application (Linux) on Windows (#2541) * fix_app * fix_web_some_bug * fix_app * fix_app * fix_app * fix_app_some_bug * fix_app_some_bug * fix_app_some_bug --- dinky-admin/pom.xml | 5 - .../dinky/controller/DownloadController.java | 15 +- .../main/java/org/dinky/init/SystemInit.java | 4 +- .../resource/impl/OssResourceManager.java | 2 +- dinky-app/dinky-app-1.14/pom.xml | 12 - .../src/main/java/org/dinky/app/MainApp.java | 4 - dinky-app/dinky-app-1.15/pom.xml | 12 - .../src/main/java/org/dinky/app/MainApp.java | 4 - dinky-app/dinky-app-1.16/pom.xml | 5 +- .../src/main/java/org/dinky/app/MainApp.java | 4 - dinky-app/dinky-app-1.17/pom.xml | 5 +- .../src/main/java/org/dinky/app/MainApp.java | 4 - dinky-app/dinky-app-1.18/pom.xml | 5 +- .../src/main/java/org/dinky/app/MainApp.java | 4 - dinky-app/dinky-app-base/pom.xml | 44 + .../main/java/org/dinky/app/db/DBUtil.java | 13 +- .../org/dinky/app/flinksql/Submitter.java | 106 +- .../java/org/dinky/app/model/SysConfig.java | 35 + .../app/resource/BaseResourceManager.java | 52 + .../resource/impl/HdfsResourceManager.java | 63 + .../app/resource/impl/OssResourceManager.java | 48 + .../org/dinky/app/url/RsURLConnection.java | 50 + .../org/dinky/app/url/RsURLStreamHandler.java | 32 + .../app/url/RsURLStreamHandlerFactory.java | 51 + .../java/org/dinky/app/util/FlinkAppUtil.java | 83 +- dinky-client/dinky-client-1.14/pom.xml | 4 - .../java/org/apache/flink/yarn/Utils.java | 658 ++++++ .../flink/yarn/YarnClusterDescriptor.java | 1634 +++++++++++++++ .../executor/CustomTableEnvironmentImpl.java | 43 +- dinky-client/dinky-client-1.15/pom.xml | 4 - .../java/org/apache/flink/yarn/Utils.java | 658 ++++++ .../flink/yarn/YarnClusterDescriptor.java | 1694 ++++++++++++++++ .../executor/CustomTableEnvironmentImpl.java | 12 +- .../java/org/apache/flink/yarn/Utils.java | 692 +++++++ .../flink/yarn/YarnClusterDescriptor.java | 1737 ++++++++++++++++ .../executor/CustomTableEnvironmentImpl.java | 24 +- .../java/org/apache/flink/yarn/Utils.java | 578 ++++++ .../flink/yarn/YarnClusterDescriptor.java | 1753 ++++++++++++++++ .../executor/CustomTableEnvironmentImpl.java | 15 - .../java/org/apache/flink/yarn/Utils.java | 623 ++++++ .../flink/yarn/YarnClusterDescriptor.java | 1769 +++++++++++++++++ .../executor/CustomTableEnvironmentImpl.java | 15 - .../executor/CustomTableEnvironment.java | 19 +- .../dinky/trans/dml/ExecuteJarOperation.java | 8 +- dinky-common/pom.xml | 5 + .../main/java/org/dinky/data/app/AppTask.java | 5 + .../dinky/data/properties/OssProperties.java | 0 .../main/java/org/dinky/oss}/OssTemplate.java | 2 +- .../main/java/org/dinky/job/JobManager.java | 20 +- dinky-flink/dinky-flink-1.14/pom.xml | 22 - dinky-flink/dinky-flink-1.15/pom.xml | 22 - dinky-flink/dinky-flink-1.16/pom.xml | 22 +- dinky-flink/dinky-flink-1.17/pom.xml | 20 - dinky-flink/dinky-flink-1.18/pom.xml | 20 - .../dinky/gateway/config/ClusterConfig.java | 8 +- .../gateway/yarn/YarnApplicationGateway.java | 1 - .../org/dinky/gateway/yarn/YarnGateway.java | 14 +- dinky-web/src/pages/Metrics/Job/index.tsx | 512 ++--- .../ConfigurationForm/YarnConfig/index.tsx | 30 +- 59 files changed, 12702 insertions(+), 603 deletions(-) create mode 100644 dinky-app/dinky-app-base/src/main/java/org/dinky/app/model/SysConfig.java create mode 100644 dinky-app/dinky-app-base/src/main/java/org/dinky/app/resource/BaseResourceManager.java create 
mode 100644 dinky-app/dinky-app-base/src/main/java/org/dinky/app/resource/impl/HdfsResourceManager.java create mode 100644 dinky-app/dinky-app-base/src/main/java/org/dinky/app/resource/impl/OssResourceManager.java create mode 100644 dinky-app/dinky-app-base/src/main/java/org/dinky/app/url/RsURLConnection.java create mode 100644 dinky-app/dinky-app-base/src/main/java/org/dinky/app/url/RsURLStreamHandler.java create mode 100644 dinky-app/dinky-app-base/src/main/java/org/dinky/app/url/RsURLStreamHandlerFactory.java create mode 100644 dinky-client/dinky-client-1.14/src/main/java/org/apache/flink/yarn/Utils.java create mode 100644 dinky-client/dinky-client-1.14/src/main/java/org/apache/flink/yarn/YarnClusterDescriptor.java create mode 100644 dinky-client/dinky-client-1.15/src/main/java/org/apache/flink/yarn/Utils.java create mode 100644 dinky-client/dinky-client-1.15/src/main/java/org/apache/flink/yarn/YarnClusterDescriptor.java create mode 100644 dinky-client/dinky-client-1.16/src/main/java/org/apache/flink/yarn/Utils.java create mode 100644 dinky-client/dinky-client-1.16/src/main/java/org/apache/flink/yarn/YarnClusterDescriptor.java create mode 100644 dinky-client/dinky-client-1.17/src/main/java/org/apache/flink/yarn/Utils.java create mode 100644 dinky-client/dinky-client-1.17/src/main/java/org/apache/flink/yarn/YarnClusterDescriptor.java create mode 100644 dinky-client/dinky-client-1.18/src/main/java/org/apache/flink/yarn/Utils.java create mode 100644 dinky-client/dinky-client-1.18/src/main/java/org/apache/flink/yarn/YarnClusterDescriptor.java rename {dinky-admin => dinky-common}/src/main/java/org/dinky/data/properties/OssProperties.java (100%) rename {dinky-admin/src/main/java/org/dinky/utils => dinky-common/src/main/java/org/dinky/oss}/OssTemplate.java (99%) diff --git a/dinky-admin/pom.xml b/dinky-admin/pom.xml index e67a392de2..7de6bf8477 100644 --- a/dinky-admin/pom.xml +++ b/dinky-admin/pom.xml @@ -44,11 +44,6 @@ org.jasypt jasypt - - com.amazonaws - aws-java-sdk-s3 - 1.12.481 - org.yaml snakeyaml diff --git a/dinky-admin/src/main/java/org/dinky/controller/DownloadController.java b/dinky-admin/src/main/java/org/dinky/controller/DownloadController.java index 1e94b040f6..31da1d9ad4 100644 --- a/dinky-admin/src/main/java/org/dinky/controller/DownloadController.java +++ b/dinky-admin/src/main/java/org/dinky/controller/DownloadController.java @@ -26,6 +26,7 @@ import org.dinky.data.model.FlinkUdfManifest; import org.dinky.function.constant.PathConstant; import org.dinky.function.util.ZipWriter; +import org.dinky.service.resource.BaseResourceManager; import java.io.File; import java.io.InputStream; @@ -50,7 +51,9 @@ import io.swagger.annotations.ApiOperation; import lombok.extern.slf4j.Slf4j; -/** @since 0.7.0 */ +/** + * @since 0.7.0 + */ @Slf4j @RestController @Api(tags = "UDF & App Jar Controller") @@ -102,7 +105,7 @@ public void downloadJavaUDF(@PathVariable Integer taskId, HttpServletResponse re * 提供docker通过http下载dinky-app.jar * * @param version 版本 - * @param resp resp + * @param resp resp */ @GetMapping("downloadAppJar/{version}") @Log(title = "Download App Jar", businessType = BusinessType.DOWNLOAD) @@ -114,4 +117,12 @@ public void downloadAppJar(@PathVariable String version, HttpServletResponse res ServletUtil.write(resp, files.get(0)); } } + + @GetMapping("downloadFromRs") + @Log(title = "Download From Resource", businessType = BusinessType.DOWNLOAD) + @ApiOperation("Download From Resource") + public void downloadJavaUDF(String path, HttpServletResponse resp) { + InputStream 
inputStream = BaseResourceManager.getInstance().readFile(path); + ServletUtil.write(resp, inputStream); + } } diff --git a/dinky-admin/src/main/java/org/dinky/init/SystemInit.java b/dinky-admin/src/main/java/org/dinky/init/SystemInit.java index 367a9b0ce0..36e3130dd9 100644 --- a/dinky-admin/src/main/java/org/dinky/init/SystemInit.java +++ b/dinky-admin/src/main/java/org/dinky/init/SystemInit.java @@ -34,6 +34,7 @@ import org.dinky.function.constant.PathConstant; import org.dinky.function.pool.UdfCodePool; import org.dinky.job.FlinkJobTask; +import org.dinky.oss.OssTemplate; import org.dinky.scheduler.client.ProjectClient; import org.dinky.scheduler.exception.SchedulerException; import org.dinky.scheduler.model.Project; @@ -46,7 +47,6 @@ import org.dinky.service.resource.impl.OssResourceManager; import org.dinky.url.RsURLStreamHandlerFactory; import org.dinky.utils.JsonUtils; -import org.dinky.utils.OssTemplate; import org.dinky.utils.UDFUtils; import org.apache.catalina.webresources.TomcatURLStreamHandlerFactory; @@ -98,9 +98,9 @@ public class SystemInit implements ApplicationRunner { public void run(ApplicationArguments args) { TenantContextHolder.ignoreTenant(); initResources(); - List tenants = tenantService.list(); sysConfigService.initSysConfig(); + for (Tenant tenant : tenants) { taskService.initDefaultFlinkSQLEnv(tenant.getId()); } diff --git a/dinky-admin/src/main/java/org/dinky/service/resource/impl/OssResourceManager.java b/dinky-admin/src/main/java/org/dinky/service/resource/impl/OssResourceManager.java index ca964956b7..1b70a7a992 100644 --- a/dinky-admin/src/main/java/org/dinky/service/resource/impl/OssResourceManager.java +++ b/dinky-admin/src/main/java/org/dinky/service/resource/impl/OssResourceManager.java @@ -21,8 +21,8 @@ import org.dinky.data.exception.BusException; import org.dinky.data.exception.DinkyException; +import org.dinky.oss.OssTemplate; import org.dinky.service.resource.BaseResourceManager; -import org.dinky.utils.OssTemplate; import java.io.File; import java.io.InputStream; diff --git a/dinky-app/dinky-app-1.14/pom.xml b/dinky-app/dinky-app-1.14/pom.xml index d8e57cbeaf..7203f9783d 100644 --- a/dinky-app/dinky-app-1.14/pom.xml +++ b/dinky-app/dinky-app-1.14/pom.xml @@ -40,10 +40,6 @@ dinky-app-base ${project.version} - - mysql - mysql-connector-java - org.dinky dinky-client-1.14 @@ -53,14 +49,6 @@ dinky-flink-1.14 ${scope.runtime} - - org.dinky - dinky-client-base - - - org.dinky - dinky-executor - diff --git a/dinky-app/dinky-app-1.14/src/main/java/org/dinky/app/MainApp.java b/dinky-app/dinky-app-1.14/src/main/java/org/dinky/app/MainApp.java index 5f8e2bef14..fbcc98e3e5 100644 --- a/dinky-app/dinky-app-1.14/src/main/java/org/dinky/app/MainApp.java +++ b/dinky-app/dinky-app-1.14/src/main/java/org/dinky/app/MainApp.java @@ -22,7 +22,6 @@ import org.dinky.app.constant.AppParamConstant; import org.dinky.app.db.DBUtil; import org.dinky.app.flinksql.Submitter; -import org.dinky.app.util.FlinkAppUtil; import org.dinky.data.app.AppParamConfig; import org.dinky.utils.JsonUtils; @@ -56,9 +55,6 @@ public static void main(String[] args) throws Exception { } catch (Exception e) { log.error("exectue app failed with config: {}", appConfig); throw e; - } finally { - log.info("Start Monitor Job"); - FlinkAppUtil.monitorFlinkTask(appConfig.getTaskId()); } } } diff --git a/dinky-app/dinky-app-1.15/pom.xml b/dinky-app/dinky-app-1.15/pom.xml index 74b3608f81..0b3c9ee27d 100644 --- a/dinky-app/dinky-app-1.15/pom.xml +++ b/dinky-app/dinky-app-1.15/pom.xml @@ -39,10 +39,6 @@ 
org.dinky dinky-app-base - - mysql - mysql-connector-java - org.dinky dinky-client-1.15 @@ -52,14 +48,6 @@ dinky-flink-1.15 ${scope.runtime} - - org.dinky - dinky-client-base - - - org.dinky - dinky-executor - diff --git a/dinky-app/dinky-app-1.15/src/main/java/org/dinky/app/MainApp.java b/dinky-app/dinky-app-1.15/src/main/java/org/dinky/app/MainApp.java index 5f8e2bef14..fbcc98e3e5 100644 --- a/dinky-app/dinky-app-1.15/src/main/java/org/dinky/app/MainApp.java +++ b/dinky-app/dinky-app-1.15/src/main/java/org/dinky/app/MainApp.java @@ -22,7 +22,6 @@ import org.dinky.app.constant.AppParamConstant; import org.dinky.app.db.DBUtil; import org.dinky.app.flinksql.Submitter; -import org.dinky.app.util.FlinkAppUtil; import org.dinky.data.app.AppParamConfig; import org.dinky.utils.JsonUtils; @@ -56,9 +55,6 @@ public static void main(String[] args) throws Exception { } catch (Exception e) { log.error("exectue app failed with config: {}", appConfig); throw e; - } finally { - log.info("Start Monitor Job"); - FlinkAppUtil.monitorFlinkTask(appConfig.getTaskId()); } } } diff --git a/dinky-app/dinky-app-1.16/pom.xml b/dinky-app/dinky-app-1.16/pom.xml index 519331a60f..7c5a83c133 100644 --- a/dinky-app/dinky-app-1.16/pom.xml +++ b/dinky-app/dinky-app-1.16/pom.xml @@ -25,12 +25,11 @@ org.dinky - dinky-client-${dinky.flink.version} - ${scope.runtime} + dinky-client-1.16 org.dinky - dinky-flink-${dinky.flink.version} + dinky-flink-1.16 ${scope.runtime} diff --git a/dinky-app/dinky-app-1.16/src/main/java/org/dinky/app/MainApp.java b/dinky-app/dinky-app-1.16/src/main/java/org/dinky/app/MainApp.java index 6578fad658..9effc900f8 100644 --- a/dinky-app/dinky-app-1.16/src/main/java/org/dinky/app/MainApp.java +++ b/dinky-app/dinky-app-1.16/src/main/java/org/dinky/app/MainApp.java @@ -22,7 +22,6 @@ import org.dinky.app.constant.AppParamConstant; import org.dinky.app.db.DBUtil; import org.dinky.app.flinksql.Submitter; -import org.dinky.app.util.FlinkAppUtil; import org.dinky.data.app.AppParamConfig; import org.dinky.utils.JsonUtils; @@ -55,9 +54,6 @@ public static void main(String[] args) throws Exception { Submitter.submit(appConfig); } catch (Exception e) { log.error("exectue app failed : ", e); - } finally { - log.info("Start Monitor Job"); - FlinkAppUtil.monitorFlinkTask(appConfig.getTaskId()); } } } diff --git a/dinky-app/dinky-app-1.17/pom.xml b/dinky-app/dinky-app-1.17/pom.xml index ccd829741a..7d1a452df3 100644 --- a/dinky-app/dinky-app-1.17/pom.xml +++ b/dinky-app/dinky-app-1.17/pom.xml @@ -25,12 +25,11 @@ org.dinky - dinky-client-${dinky.flink.version} - ${scope.runtime} + dinky-client-1.17 org.dinky - dinky-flink-${dinky.flink.version} + dinky-flink-1.17 ${scope.runtime} diff --git a/dinky-app/dinky-app-1.17/src/main/java/org/dinky/app/MainApp.java b/dinky-app/dinky-app-1.17/src/main/java/org/dinky/app/MainApp.java index 5f8e2bef14..fbcc98e3e5 100644 --- a/dinky-app/dinky-app-1.17/src/main/java/org/dinky/app/MainApp.java +++ b/dinky-app/dinky-app-1.17/src/main/java/org/dinky/app/MainApp.java @@ -22,7 +22,6 @@ import org.dinky.app.constant.AppParamConstant; import org.dinky.app.db.DBUtil; import org.dinky.app.flinksql.Submitter; -import org.dinky.app.util.FlinkAppUtil; import org.dinky.data.app.AppParamConfig; import org.dinky.utils.JsonUtils; @@ -56,9 +55,6 @@ public static void main(String[] args) throws Exception { } catch (Exception e) { log.error("exectue app failed with config: {}", appConfig); throw e; - } finally { - log.info("Start Monitor Job"); - 
FlinkAppUtil.monitorFlinkTask(appConfig.getTaskId()); } } } diff --git a/dinky-app/dinky-app-1.18/pom.xml b/dinky-app/dinky-app-1.18/pom.xml index cb8a86f4b0..d9f6606eae 100644 --- a/dinky-app/dinky-app-1.18/pom.xml +++ b/dinky-app/dinky-app-1.18/pom.xml @@ -25,12 +25,11 @@ org.dinky - dinky-client-${dinky.flink.version} - ${scope.runtime} + dinky-client-1.18 org.dinky - dinky-flink-${dinky.flink.version} + dinky-flink-1.18 ${scope.runtime} diff --git a/dinky-app/dinky-app-1.18/src/main/java/org/dinky/app/MainApp.java b/dinky-app/dinky-app-1.18/src/main/java/org/dinky/app/MainApp.java index 5f8e2bef14..fbcc98e3e5 100644 --- a/dinky-app/dinky-app-1.18/src/main/java/org/dinky/app/MainApp.java +++ b/dinky-app/dinky-app-1.18/src/main/java/org/dinky/app/MainApp.java @@ -22,7 +22,6 @@ import org.dinky.app.constant.AppParamConstant; import org.dinky.app.db.DBUtil; import org.dinky.app.flinksql.Submitter; -import org.dinky.app.util.FlinkAppUtil; import org.dinky.data.app.AppParamConfig; import org.dinky.utils.JsonUtils; @@ -56,9 +55,6 @@ public static void main(String[] args) throws Exception { } catch (Exception e) { log.error("exectue app failed with config: {}", appConfig); throw e; - } finally { - log.info("Start Monitor Job"); - FlinkAppUtil.monitorFlinkTask(appConfig.getTaskId()); } } } diff --git a/dinky-app/dinky-app-base/pom.xml b/dinky-app/dinky-app-base/pom.xml index 20b6dd9be4..907b550bdc 100644 --- a/dinky-app/dinky-app-base/pom.xml +++ b/dinky-app/dinky-app-base/pom.xml @@ -31,6 +31,12 @@ Dinky : App : Base + + org.apache.hadoop + hadoop-common + 3.1.0 + provided + mysql mysql-connector-java @@ -38,6 +44,44 @@ org.dinky dinky-executor + + + com.github.xiaoymin + knife4j-openapi2-spring-boot-starter + + + com.github.docker-java + docker-java-core + + + com.github.docker-java + docker-java-transport-httpclient5 + + + org.jeasy + easy-rules-core + + + org.jeasy + easy-rules-spel + + + org.eclipse.jgit + org.eclipse.jgit.archive + + + org.eclipse.jgit + org.eclipse.jgit.ssh.jsch + + + com.github.oshi + oshi-core + + + org.dinky + dinky-gateway + + org.projectlombok diff --git a/dinky-app/dinky-app-base/src/main/java/org/dinky/app/db/DBUtil.java b/dinky-app/dinky-app-base/src/main/java/org/dinky/app/db/DBUtil.java index 45a83fe4b3..1f62983d69 100644 --- a/dinky-app/dinky-app-base/src/main/java/org/dinky/app/db/DBUtil.java +++ b/dinky-app/dinky-app-base/src/main/java/org/dinky/app/db/DBUtil.java @@ -19,6 +19,7 @@ package org.dinky.app.db; +import org.dinky.app.model.SysConfig; import org.dinky.data.app.AppDatabase; import org.dinky.data.app.AppParamConfig; import org.dinky.data.app.AppTask; @@ -68,14 +69,8 @@ public static String getDbSourceSQLStatement() throws SQLException { return sb.toString(); } - public static String getSysConfig(String key) throws SQLException { - Entity option = Entity.create("dinky_sys_config").set("name", key); - List entities = db.find(option); - if (entities.size() <= 0) { - throw new IllegalArgumentException( - StrFormatter.format("The system conifg is not found: {}, please check! 
", key)); - } else { - return entities.get(0).getStr("value"); - } + public static List getSysConfigList() throws SQLException { + Entity option = Entity.create("dinky_sys_config"); + return db.find(option, SysConfig.class); } } diff --git a/dinky-app/dinky-app-base/src/main/java/org/dinky/app/flinksql/Submitter.java b/dinky-app/dinky-app-base/src/main/java/org/dinky/app/flinksql/Submitter.java index 63bb35abe9..16e799ddde 100644 --- a/dinky-app/dinky-app-base/src/main/java/org/dinky/app/flinksql/Submitter.java +++ b/dinky-app/dinky-app-base/src/main/java/org/dinky/app/flinksql/Submitter.java @@ -19,19 +19,33 @@ package org.dinky.app.flinksql; +import static org.apache.hadoop.fs.FileSystem.getDefaultUri; + import org.dinky.app.db.DBUtil; import org.dinky.app.model.StatementParam; +import org.dinky.app.model.SysConfig; +import org.dinky.app.resource.impl.HdfsResourceManager; +import org.dinky.app.resource.impl.OssResourceManager; +import org.dinky.app.url.RsURLStreamHandlerFactory; +import org.dinky.app.util.FlinkAppUtil; import org.dinky.assertion.Asserts; +import org.dinky.config.Dialect; import org.dinky.constant.FlinkSQLConstant; import org.dinky.data.app.AppParamConfig; import org.dinky.data.app.AppTask; -import org.dinky.data.enums.Status; +import org.dinky.data.exception.DinkyException; +import org.dinky.data.model.SystemConfiguration; +import org.dinky.data.properties.OssProperties; import org.dinky.executor.Executor; import org.dinky.executor.ExecutorConfig; import org.dinky.executor.ExecutorFactory; import org.dinky.interceptor.FlinkInterceptor; +import org.dinky.oss.OssTemplate; import org.dinky.parser.SqlType; import org.dinky.trans.Operations; +import org.dinky.trans.dml.ExecuteJarOperation; +import org.dinky.trans.parse.AddJarSqlParseStrategy; +import org.dinky.trans.parse.ExecuteJarParseStrategy; import org.dinky.utils.SqlUtil; import org.dinky.utils.ZipUtils; @@ -39,6 +53,8 @@ import org.apache.commons.lang3.StringUtils; import org.apache.flink.configuration.PipelineOptions; import org.apache.flink.python.PythonOptions; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; import java.io.File; import java.io.FileOutputStream; @@ -53,15 +69,20 @@ import java.time.LocalDateTime; import java.util.ArrayList; import java.util.Arrays; +import java.util.HashMap; import java.util.List; +import java.util.Map; import java.util.stream.Collectors; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import cn.hutool.core.collection.CollUtil; import cn.hutool.core.io.FileUtil; +import cn.hutool.core.lang.Singleton; import cn.hutool.core.util.ArrayUtil; import cn.hutool.core.util.URLUtil; +import lombok.SneakyThrows; /** * FlinkSQLFactory @@ -71,7 +92,52 @@ public class Submitter { private static final Logger log = LoggerFactory.getLogger(Submitter.class); + private static void initSystemConfiguration() throws SQLException { + SystemConfiguration systemConfiguration = SystemConfiguration.getInstances(); + List sysConfigList = DBUtil.getSysConfigList(); + Map configMap = + CollUtil.toMap(sysConfigList, new HashMap<>(), SysConfig::getName, SysConfig::getValue); + systemConfiguration.initSetConfiguration(configMap); + } + + private static void initResource() throws SQLException { + SystemConfiguration systemConfiguration = SystemConfiguration.getInstances(); + switch (systemConfiguration.getResourcesModel().getValue()) { + case OSS: + OssProperties ossProperties = new OssProperties(); + ossProperties.setAccessKey( + 
systemConfiguration.getResourcesOssAccessKey().getValue()); + ossProperties.setSecretKey( + systemConfiguration.getResourcesOssSecretKey().getValue()); + ossProperties.setEndpoint( + systemConfiguration.getResourcesOssEndpoint().getValue()); + ossProperties.setBucketName( + systemConfiguration.getResourcesOssBucketName().getValue()); + ossProperties.setRegion( + systemConfiguration.getResourcesOssRegion().getValue()); + Singleton.get(OssResourceManager.class).setOssTemplate(new OssTemplate(ossProperties)); + break; + case HDFS: + final Configuration configuration = new Configuration(); + configuration.set( + "fs.defaultFS", + systemConfiguration.getResourcesHdfsDefaultFS().getValue()); + try { + FileSystem fileSystem = FileSystem.get( + getDefaultUri(configuration), + configuration, + systemConfiguration.getResourcesHdfsUser().getValue()); + Singleton.get(HdfsResourceManager.class).setHdfs(fileSystem); + } catch (Exception e) { + throw new DinkyException(e); + } + } + } + public static void submit(AppParamConfig config) throws SQLException { + initSystemConfiguration(); + initResource(); + URL.setURLStreamHandlerFactory(new RsURLStreamHandlerFactory()); log.info("{} Start Submit Job:{}", LocalDateTime.now(), config.getTaskId()); AppTask appTask = DBUtil.getTask(config.getTaskId()); @@ -89,15 +155,23 @@ public static void submit(AppParamConfig config) throws SQLException { // .config(JsonUtils.toMap(appTask.getConfigJson())) .build(); + Executor executor = ExecutorFactory.buildAppStreamExecutor(executorConfig); + + log.info("Start Monitor Job"); + FlinkAppUtil.monitorFlinkTask(config.getTaskId()); + // 加载第三方jar //TODO 这里有问题,需要修一修 // loadDep(appTask.getType(), // config.getTaskId(),DBUtil.getSysConfig(Status.SYS_ENV_SETTINGS_DINKYADDR.getKey()), executorConfig); - log.info("The job configuration is as follows: {}", executorConfig); String[] statements = - SqlUtil.getStatements(sql, DBUtil.getSysConfig(Status.SYS_FLINK_SETTINGS_SQLSEPARATOR.getKey())); - excuteJob(executorConfig, statements); + SqlUtil.getStatements(sql, SystemConfiguration.getInstances().getSqlSeparator()); + if (Dialect.FLINK_JAR == appTask.getDialect()) { + executeJarJob(appTask.getType(), executor, statements); + } else { + executeJob(executor, statements); + } } public static String buildSql(AppTask appTask) throws SQLException { @@ -209,9 +283,24 @@ public static boolean downloadFile(String url, String path) throws IOException { } } - public static void excuteJob(ExecutorConfig executorConfig, String[] statements) { + @SneakyThrows + public static void executeJarJob(String type, Executor executor, String[] statements) { + for (int i = 0; i < statements.length; i++) { + String sqlStatement = executor.pretreatStatement(statements[i]); + if (ExecuteJarParseStrategy.INSTANCE.match(sqlStatement)) { + ExecuteJarOperation executeJarOperation = new ExecuteJarOperation(sqlStatement); + executeJarOperation.execute(executor.getCustomTableEnvironment()); + break; + } else if (Operations.getOperationType(sqlStatement) == SqlType.ADD + && "kubernetes-application".equals(type)) { + executor.addJar(AddJarSqlParseStrategy.getInfo(sqlStatement)); + } + } + } + + public static void executeJob(Executor executor, String[] statements) { - Executor executor = ExecutorFactory.buildAppStreamExecutor(executorConfig); + ExecutorConfig executorConfig = executor.getExecutorConfig(); List ddl = new ArrayList<>(); List trans = new ArrayList<>(); List execute = new ArrayList<>(); @@ -228,11 +317,6 @@ public static void excuteJob(ExecutorConfig 
executorConfig, String[] statements) if (!executorConfig.isUseStatementSet()) { break; } - } else if (operationType.equals(SqlType.EXECUTE)) { - execute.add(new StatementParam(statement, operationType)); - if (!executorConfig.isUseStatementSet()) { - break; - } } else { ddl.add(new StatementParam(statement, operationType)); } diff --git a/dinky-app/dinky-app-base/src/main/java/org/dinky/app/model/SysConfig.java b/dinky-app/dinky-app-base/src/main/java/org/dinky/app/model/SysConfig.java new file mode 100644 index 0000000000..8419ecfa59 --- /dev/null +++ b/dinky-app/dinky-app-base/src/main/java/org/dinky/app/model/SysConfig.java @@ -0,0 +1,35 @@ +/* + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package org.dinky.app.model; + +import java.io.Serializable; + +import lombok.Data; + +@Data +public class SysConfig implements Serializable { + private static final long serialVersionUID = 1L; + + private Integer id; + + private String name; + + private String value; +} diff --git a/dinky-app/dinky-app-base/src/main/java/org/dinky/app/resource/BaseResourceManager.java b/dinky-app/dinky-app-base/src/main/java/org/dinky/app/resource/BaseResourceManager.java new file mode 100644 index 0000000000..b144bf9820 --- /dev/null +++ b/dinky-app/dinky-app-base/src/main/java/org/dinky/app/resource/BaseResourceManager.java @@ -0,0 +1,52 @@ +/* + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +package org.dinky.app.resource; + +import org.dinky.app.resource.impl.HdfsResourceManager; +import org.dinky.app.resource.impl.OssResourceManager; +import org.dinky.data.model.SystemConfiguration; + +import java.io.InputStream; + +import cn.hutool.core.io.FileUtil; +import cn.hutool.core.lang.Singleton; + +public interface BaseResourceManager { + SystemConfiguration instances = SystemConfiguration.getInstances(); + + InputStream readFile(String path); + + static BaseResourceManager getInstance() { + switch (SystemConfiguration.getInstances().getResourcesModel().getValue()) { + case HDFS: + return Singleton.get(HdfsResourceManager.class); + case OSS: + return Singleton.get(OssResourceManager.class); + default: + return null; + } + } + + default String getFilePath(String path) { + return FileUtil.normalize( + FileUtil.file(instances.getResourcesUploadBasePath().getValue(), path) + .toString()); + } +} diff --git a/dinky-app/dinky-app-base/src/main/java/org/dinky/app/resource/impl/HdfsResourceManager.java b/dinky-app/dinky-app-base/src/main/java/org/dinky/app/resource/impl/HdfsResourceManager.java new file mode 100644 index 0000000000..bb53b27951 --- /dev/null +++ b/dinky-app/dinky-app-base/src/main/java/org/dinky/app/resource/impl/HdfsResourceManager.java @@ -0,0 +1,63 @@ +/* + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +package org.dinky.app.resource.impl; + +import org.dinky.app.resource.BaseResourceManager; +import org.dinky.data.exception.BusException; +import org.dinky.data.model.SystemConfiguration; + +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; + +import java.io.IOException; +import java.io.InputStream; +import java.net.URL; + +import cn.hutool.core.util.URLUtil; + +public class HdfsResourceManager implements BaseResourceManager { + FileSystem hdfs; + SystemConfiguration systemConfiguration = SystemConfiguration.getInstances(); + + @Override + public InputStream readFile(String path) { + try { + if (systemConfiguration.getResourcesHdfsDefaultFS().getValue().contains("file:/")) { + return new URL("http://" + systemConfiguration.getDinkyAddr().getValue() + + "/download/downloadFromRs?path=" + URLUtil.encode(path)) + .openStream(); + } + return getHdfs().open(new Path(getFilePath(path))); + } catch (IOException e) { + throw BusException.valueOf("file.read.failed", e); + } + } + + public FileSystem getHdfs() { + if (hdfs == null && instances.getResourcesEnable().getValue()) { + throw BusException.valueOf("Resource configuration error, HDFS is not enabled"); + } + return hdfs; + } + + public synchronized void setHdfs(FileSystem hdfs) { + this.hdfs = hdfs; + } +} diff --git a/dinky-app/dinky-app-base/src/main/java/org/dinky/app/resource/impl/OssResourceManager.java b/dinky-app/dinky-app-base/src/main/java/org/dinky/app/resource/impl/OssResourceManager.java new file mode 100644 index 0000000000..554cc47992 --- /dev/null +++ b/dinky-app/dinky-app-base/src/main/java/org/dinky/app/resource/impl/OssResourceManager.java @@ -0,0 +1,48 @@ +/* + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +package org.dinky.app.resource.impl; + +import org.dinky.app.resource.BaseResourceManager; +import org.dinky.data.exception.BusException; +import org.dinky.oss.OssTemplate; + +import java.io.InputStream; + +public class OssResourceManager implements BaseResourceManager { + OssTemplate ossTemplate; + + @Override + public InputStream readFile(String path) { + return getOssTemplate() + .getObject(getOssTemplate().getBucketName(), getFilePath(path)) + .getObjectContent(); + } + + public OssTemplate getOssTemplate() { + if (ossTemplate == null && instances.getResourcesEnable().getValue()) { + throw BusException.valueOf("Resource configuration error, OSS is not enabled"); + } + return ossTemplate; + } + + public void setOssTemplate(OssTemplate ossTemplate) { + this.ossTemplate = ossTemplate; + } +} diff --git a/dinky-app/dinky-app-base/src/main/java/org/dinky/app/url/RsURLConnection.java b/dinky-app/dinky-app-base/src/main/java/org/dinky/app/url/RsURLConnection.java new file mode 100644 index 0000000000..2ed8ebb983 --- /dev/null +++ b/dinky-app/dinky-app-base/src/main/java/org/dinky/app/url/RsURLConnection.java @@ -0,0 +1,50 @@ +/* + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package org.dinky.app.url; + +import org.dinky.app.resource.BaseResourceManager; +import org.dinky.data.exception.BusException; + +import java.io.InputStream; +import java.net.URL; +import java.net.URLConnection; + +public class RsURLConnection extends URLConnection { + private InputStream inputStream; + + @Override + public void connect() { + BaseResourceManager instance = BaseResourceManager.getInstance(); + if (instance == null) { + throw BusException.valueOf("ResourceManager is disabled"); + } + inputStream = instance.readFile(getURL().getPath()); + } + + @Override + public InputStream getInputStream() { + connect(); + return inputStream; + } + + public RsURLConnection(URL url) { + super(url); + } +} diff --git a/dinky-app/dinky-app-base/src/main/java/org/dinky/app/url/RsURLStreamHandler.java b/dinky-app/dinky-app-base/src/main/java/org/dinky/app/url/RsURLStreamHandler.java new file mode 100644 index 0000000000..5523cf62c4 --- /dev/null +++ b/dinky-app/dinky-app-base/src/main/java/org/dinky/app/url/RsURLStreamHandler.java @@ -0,0 +1,32 @@ +/* + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package org.dinky.app.url; + +import java.net.URL; +import java.net.URLConnection; +import java.net.URLStreamHandler; + +public class RsURLStreamHandler extends URLStreamHandler { + + @Override + protected URLConnection openConnection(URL u) { + return new RsURLConnection(u); + } +} diff --git a/dinky-app/dinky-app-base/src/main/java/org/dinky/app/url/RsURLStreamHandlerFactory.java b/dinky-app/dinky-app-base/src/main/java/org/dinky/app/url/RsURLStreamHandlerFactory.java new file mode 100644 index 0000000000..6f6874b2d7 --- /dev/null +++ b/dinky-app/dinky-app-base/src/main/java/org/dinky/app/url/RsURLStreamHandlerFactory.java @@ -0,0 +1,51 @@ +/* + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package org.dinky.app.url; + +import java.net.URLStreamHandler; +import java.net.URLStreamHandlerFactory; + +public class RsURLStreamHandlerFactory implements URLStreamHandlerFactory { + private static final String PREFIX = "sun.net.www.protocol"; + + @Override + public URLStreamHandler createURLStreamHandler(String protocol) { + if ("rs".equals(protocol)) { + return new RsURLStreamHandler(); + } + try { + Class.forName("org.apache.hadoop.fs.FsUrlStreamHandlerFactory"); + } catch (Exception e) { + return null; + } + String name = PREFIX + "." + protocol + ".Handler"; + try { + @SuppressWarnings("deprecation") + Object o = Class.forName(name).newInstance(); + return (URLStreamHandler) o; + } catch (ClassNotFoundException x) { + // ignore + } catch (Exception e) { + // For compatibility, all Exceptions are ignored. 
+ // any number of exceptions can get thrown here + } + return null; + } +} diff --git a/dinky-app/dinky-app-base/src/main/java/org/dinky/app/util/FlinkAppUtil.java b/dinky-app/dinky-app-base/src/main/java/org/dinky/app/util/FlinkAppUtil.java index 5137d9e6ae..da0d8e31f9 100644 --- a/dinky-app/dinky-app-base/src/main/java/org/dinky/app/util/FlinkAppUtil.java +++ b/dinky-app/dinky-app-base/src/main/java/org/dinky/app/util/FlinkAppUtil.java @@ -19,19 +19,20 @@ package org.dinky.app.util; -import org.dinky.app.db.DBUtil; -import org.dinky.constant.FlinkConstant; -import org.dinky.data.enums.JobStatus; -import org.dinky.data.enums.Status; +import org.dinky.context.CustomTableEnvironmentContext; +import org.dinky.data.model.SystemConfiguration; import org.dinky.utils.JsonUtils; +import org.apache.flink.api.common.JobExecutionResult; +import org.apache.flink.api.common.JobStatus; import org.apache.flink.client.deployment.StandaloneClusterId; import org.apache.flink.client.program.rest.RestClusterClient; import org.apache.flink.configuration.Configuration; -import org.apache.flink.configuration.GlobalConfiguration; -import org.apache.flink.runtime.client.JobStatusMessage; - -import java.util.Collection; +import org.apache.flink.configuration.ReadableConfig; +import org.apache.flink.core.execution.JobClient; +import org.apache.flink.core.execution.JobListener; +import org.apache.flink.runtime.client.JobCancellationException; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; import cn.hutool.core.text.StrFormatter; import cn.hutool.http.HttpUtil; @@ -46,27 +47,36 @@ public class FlinkAppUtil { * If the task is completed, it sends a hook notification and stops monitoring. */ public static void monitorFlinkTask(int taskId) { - boolean isRun = true; - try (RestClusterClient client = createClient()) { - while (isRun) { - Collection jobs = client.listJobs().get(); - if (jobs.isEmpty()) { - log.error("No Flink task found, try again in 2 seconds....."); - } - for (JobStatusMessage job : jobs) { - if (JobStatus.isDone(job.getJobState().toString())) { - sendHook(taskId, job.getJobId().toHexString(), 0); - log.info("hook {} finished.", job.getJobName()); - // There should be only one in application mode, so stop monitoring here - isRun = false; + StreamExecutionEnvironment streamExecutionEnvironment = + CustomTableEnvironmentContext.get().getStreamExecutionEnvironment(); + streamExecutionEnvironment.registerJobListener(new JobListener() { + @Override + public void onJobSubmitted(JobClient jobClient, Throwable throwable) { + jobClient.getJobExecutionResult().thenAccept(jobExecutionResult -> finshedHook(jobClient, taskId)); + jobClient.getJobStatus().thenAccept(job -> { + if (job == JobStatus.FINISHED) { + finshedHook(jobClient, taskId); } + }); + } + + @Override + public void onJobExecuted(JobExecutionResult jobExecutionResult, Throwable throwable) { + if (throwable instanceof JobCancellationException) { + // todo cancel task + } else { + // other exception } - Thread.sleep(5000); } - } catch (Exception e) { - // If an exception is thrown, it will cause the k8s pod to trigger a restart, - // resulting in an inability to exit normally - log.error("hook failed:", e); + }); + } + + private static void finshedHook(JobClient jobClient, int taskId) { + try { + sendHook(taskId, jobClient.getJobID().toHexString(), 0); + log.info("hook finished."); + } catch (InterruptedException e) { + throw new RuntimeException(e); } } @@ -79,9 +89,9 @@ public static void monitorFlinkTask(int taskId) { 
*/ private static void sendHook(int taskId, String jobId, int reTryCount) throws InterruptedException { try { - String dinkyAddr = DBUtil.getSysConfig(Status.SYS_ENV_SETTINGS_DINKYADDR.getKey()); - String url = StrFormatter.format( - "http://{}/api/jobInstance/hookJobDone?taskId={}&jobId={}", dinkyAddr, taskId, jobId); + String dinkyAddr = SystemConfiguration.getInstances().getDinkyAddr().getValue(); + String url = + StrFormatter.format("{}/api/jobInstance/hookJobDone?taskId={}&jobId={}", dinkyAddr, taskId, jobId); String resultStr = HttpUtil.get(url); // TODO 这里应该使用Result实体类,但是Result.class不在comm里,迁移改动太大,暂时不搞 String code = JsonUtils.parseObject(resultStr).get("code").toString(); @@ -101,17 +111,16 @@ private static void sendHook(int taskId, String jobId, int reTryCount) throws In /** * Create a REST cluster client for Flink. + * * @return * @throws Exception */ private static RestClusterClient createClient() throws Exception { - Configuration config; - Configuration fromEnvConfig = GlobalConfiguration.loadConfiguration(); - if (!fromEnvConfig.keySet().isEmpty()) { - config = fromEnvConfig; - } else { - config = GlobalConfiguration.loadConfiguration(FlinkConstant.DEFAULT_FLINK_HOME); - } - return new RestClusterClient<>(config, StandaloneClusterId.getInstance()); + ReadableConfig config = CustomTableEnvironmentContext.get() + .getStreamExecutionEnvironment() + .getConfiguration(); + Configuration configuration = new Configuration((Configuration) config); + + return new RestClusterClient<>(configuration, StandaloneClusterId.getInstance()); } } diff --git a/dinky-client/dinky-client-1.14/pom.xml b/dinky-client/dinky-client-1.14/pom.xml index 4a6e807f45..3b0965a187 100644 --- a/dinky-client/dinky-client-1.14/pom.xml +++ b/dinky-client/dinky-client-1.14/pom.xml @@ -35,10 +35,6 @@ org.dinky dinky-client-base - - org.dinky - dinky-common - org.dinky dinky-flink-1.14 diff --git a/dinky-client/dinky-client-1.14/src/main/java/org/apache/flink/yarn/Utils.java b/dinky-client/dinky-client-1.14/src/main/java/org/apache/flink/yarn/Utils.java new file mode 100644 index 0000000000..0598def897 --- /dev/null +++ b/dinky-client/dinky-client-1.14/src/main/java/org/apache/flink/yarn/Utils.java @@ -0,0 +1,658 @@ +/* + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +package org.apache.flink.yarn; + +import static org.apache.flink.util.Preconditions.checkNotNull; +import static org.apache.flink.yarn.YarnConfigKeys.ENV_FLINK_CLASSPATH; +import static org.apache.flink.yarn.YarnConfigKeys.LOCAL_RESOURCE_DESCRIPTOR_SEPARATOR; + +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.flink.configuration.ConfigUtils; +import org.apache.flink.runtime.clusterframework.BootstrapTools; +import org.apache.flink.runtime.clusterframework.ContaineredTaskManagerParameters; +import org.apache.flink.runtime.util.HadoopUtils; +import org.apache.flink.util.FlinkException; +import org.apache.flink.util.StringUtils; +import org.apache.flink.util.function.FunctionWithException; +import org.apache.flink.yarn.configuration.YarnConfigOptions; +import org.apache.flink.yarn.configuration.YarnResourceManagerDriverConfiguration; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.DataOutputBuffer; +import org.apache.hadoop.mapreduce.security.TokenCache; +import org.apache.hadoop.security.Credentials; +import org.apache.hadoop.security.UserGroupInformation; +import org.apache.hadoop.security.token.Token; +import org.apache.hadoop.security.token.TokenIdentifier; +import org.apache.hadoop.util.StringInterner; +import org.apache.hadoop.yarn.api.ApplicationConstants; +import org.apache.hadoop.yarn.api.ApplicationConstants.Environment; +import org.apache.hadoop.yarn.api.records.ContainerLaunchContext; +import org.apache.hadoop.yarn.api.records.LocalResource; +import org.apache.hadoop.yarn.api.records.LocalResourceType; +import org.apache.hadoop.yarn.api.records.LocalResourceVisibility; +import org.apache.hadoop.yarn.api.records.Resource; +import org.apache.hadoop.yarn.conf.YarnConfiguration; +import org.apache.hadoop.yarn.security.AMRMTokenIdentifier; +import org.apache.hadoop.yarn.util.ConverterUtils; +import org.apache.hadoop.yarn.util.Records; + +import java.io.Closeable; +import java.io.File; +import java.io.IOException; +import java.lang.reflect.InvocationTargetException; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.stream.Stream; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import cn.hutool.core.util.StrUtil; + +/** Utility class that provides helper methods to work with Apache Hadoop YARN. */ +public final class Utils { + + private static final Logger LOG = LoggerFactory.getLogger(Utils.class); + + /** KRB5 file name populated in YARN container for secure IT run. */ + public static final String KRB5_FILE_NAME = "krb5.conf"; + + /** Yarn site xml file name populated in YARN container for secure IT run. */ + public static final String YARN_SITE_FILE_NAME = "yarn-site.xml"; + + /** The prefixes that Flink adds to the YARN config. 
*/ + private static final String[] FLINK_CONFIG_PREFIXES = {"flink.yarn."}; + + @VisibleForTesting + static final String YARN_RM_FAIR_SCHEDULER_CLAZZ = + "org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler"; + + @VisibleForTesting + static final String YARN_RM_SLS_FAIR_SCHEDULER_CLAZZ = "org.apache.hadoop.yarn.sls.scheduler.SLSFairScheduler"; + + @VisibleForTesting + static final String YARN_RM_INCREMENT_ALLOCATION_MB_KEY = "yarn.resource-types.memory-mb.increment-allocation"; + + @VisibleForTesting + static final String YARN_RM_INCREMENT_ALLOCATION_MB_LEGACY_KEY = "yarn.scheduler.increment-allocation-mb"; + + private static final int DEFAULT_YARN_RM_INCREMENT_ALLOCATION_MB = 1024; + + @VisibleForTesting + static final String YARN_RM_INCREMENT_ALLOCATION_VCORES_KEY = "yarn.resource-types.vcores.increment-allocation"; + + @VisibleForTesting + static final String YARN_RM_INCREMENT_ALLOCATION_VCORES_LEGACY_KEY = "yarn.scheduler.increment-allocation-vcores"; + + private static final int DEFAULT_YARN_RM_INCREMENT_ALLOCATION_VCORES = 1; + + public static void setupYarnClassPath(Configuration conf, Map appMasterEnv) { + addToEnvironment(appMasterEnv, Environment.CLASSPATH.name(), appMasterEnv.get(ENV_FLINK_CLASSPATH)); + String[] applicationClassPathEntries = conf.getStrings( + YarnConfiguration.YARN_APPLICATION_CLASSPATH, + Stream.of(YarnConfiguration.DEFAULT_YARN_APPLICATION_CLASSPATH) + .map(x -> StrUtil.removeAll(x, "%")) + .map(x -> "$".equals(StrUtil.subPre(x, 1)) ? x : "$" + x) + .toArray(String[]::new)); + for (String c : applicationClassPathEntries) { + addToEnvironment(appMasterEnv, Environment.CLASSPATH.name(), c.trim()); + } + } + + /** + * Deletes the YARN application files, e.g., Flink binaries, libraries, etc., from the remote + * filesystem. + * + * @param applicationFilesDir The application files directory. + */ + public static void deleteApplicationFiles(final String applicationFilesDir) { + if (!StringUtils.isNullOrWhitespaceOnly(applicationFilesDir)) { + final org.apache.flink.core.fs.Path path = new org.apache.flink.core.fs.Path(applicationFilesDir); + try { + final org.apache.flink.core.fs.FileSystem fileSystem = path.getFileSystem(); + if (!fileSystem.delete(path, true)) { + LOG.error("Deleting yarn application files under {} was unsuccessful.", applicationFilesDir); + } + } catch (final IOException e) { + LOG.error("Could not properly delete yarn application files directory {}.", applicationFilesDir, e); + } + } else { + LOG.debug("No yarn application files directory set. Therefore, cannot clean up the data."); + } + } + + /** + * Creates a YARN resource for the remote object at the given location. 
+ * + * @param remoteRsrcPath remote location of the resource + * @param resourceSize size of the resource + * @param resourceModificationTime last modification time of the resource + * @return YARN resource + */ + static LocalResource registerLocalResource( + Path remoteRsrcPath, + long resourceSize, + long resourceModificationTime, + LocalResourceVisibility resourceVisibility, + LocalResourceType resourceType) { + LocalResource localResource = Records.newRecord(LocalResource.class); + localResource.setResource(ConverterUtils.getYarnUrlFromURI(remoteRsrcPath.toUri())); + localResource.setSize(resourceSize); + localResource.setTimestamp(resourceModificationTime); + localResource.setType(resourceType); + localResource.setVisibility(resourceVisibility); + return localResource; + } + + /** + * Creates a YARN resource for the remote object at the given location. + * + * @param fs remote filesystem + * @param remoteRsrcPath resource path to be registered + * @return YARN resource + */ + private static LocalResource registerLocalResource( + FileSystem fs, Path remoteRsrcPath, LocalResourceType resourceType) throws IOException { + FileStatus jarStat = fs.getFileStatus(remoteRsrcPath); + return registerLocalResource( + remoteRsrcPath, + jarStat.getLen(), + jarStat.getModificationTime(), + LocalResourceVisibility.APPLICATION, + resourceType); + } + + public static void setTokensFor( + ContainerLaunchContext amContainer, List paths, Configuration conf, boolean obtainingDelegationTokens) + throws IOException { + Credentials credentials = new Credentials(); + + if (obtainingDelegationTokens) { + LOG.info("Obtaining delegation tokens for HDFS and HBase."); + // for HDFS + TokenCache.obtainTokensForNamenodes(credentials, paths.toArray(new Path[0]), conf); + // for HBase + obtainTokenForHBase(credentials, conf); + } else { + LOG.info("Delegation token retrieval for HDFS and HBase is disabled."); + } + + // for user + UserGroupInformation currUsr = UserGroupInformation.getCurrentUser(); + + Collection> usrTok = currUsr.getTokens(); + for (Token token : usrTok) { + LOG.info("Adding user token " + token.getService() + " with " + token); + credentials.addToken(token.getService(), token); + } + try (DataOutputBuffer dob = new DataOutputBuffer()) { + credentials.writeTokenStorageToStream(dob); + + if (LOG.isDebugEnabled()) { + LOG.debug("Wrote tokens. Credentials buffer length: " + dob.getLength()); + } + + ByteBuffer securityTokens = ByteBuffer.wrap(dob.getData(), 0, dob.getLength()); + amContainer.setTokens(securityTokens); + } + } + + /** Obtain Kerberos security token for HBase. 
*/ + private static void obtainTokenForHBase(Credentials credentials, Configuration conf) throws IOException { + if (UserGroupInformation.isSecurityEnabled()) { + LOG.info("Attempting to obtain Kerberos security token for HBase"); + try { + // ---- + // Intended call: HBaseConfiguration.addHbaseResources(conf); + Class.forName("org.apache.hadoop.hbase.HBaseConfiguration") + .getMethod("addHbaseResources", Configuration.class) + .invoke(null, conf); + // ---- + + LOG.info("HBase security setting: {}", conf.get("hbase.security.authentication")); + + if (!"kerberos".equals(conf.get("hbase.security.authentication"))) { + LOG.info("HBase has not been configured to use Kerberos."); + return; + } + + Token token; + try { + LOG.info("Obtaining Kerberos security token for HBase"); + // ---- + // Intended call: Token token = + // TokenUtil.obtainToken(conf); + token = (Token) Class.forName("org.apache.hadoop.hbase.security.token.TokenUtil") + .getMethod("obtainToken", Configuration.class) + .invoke(null, conf); + // ---- + } catch (NoSuchMethodException e) { + // for HBase 2 + + // ---- + // Intended call: ConnectionFactory connectionFactory = + // ConnectionFactory.createConnection(conf); + Closeable connectionFactory = + (Closeable) Class.forName("org.apache.hadoop.hbase.client.ConnectionFactory") + .getMethod("createConnection", Configuration.class) + .invoke(null, conf); + // ---- + Class connectionClass = Class.forName("org.apache.hadoop.hbase.client.Connection"); + // ---- + // Intended call: Token token = + // TokenUtil.obtainToken(connectionFactory); + token = (Token) Class.forName("org.apache.hadoop.hbase.security.token.TokenUtil") + .getMethod("obtainToken", connectionClass) + .invoke(null, connectionFactory); + // ---- + if (null != connectionFactory) { + connectionFactory.close(); + } + } + + if (token == null) { + LOG.error("No Kerberos security token for HBase available"); + return; + } + + credentials.addToken(token.getService(), token); + LOG.info("Added HBase Kerberos security token to credentials."); + } catch (ClassNotFoundException + | NoSuchMethodException + | IllegalAccessException + | InvocationTargetException e) { + LOG.info( + "HBase is not available (not packaged with this application): {} : \"{}\".", + e.getClass().getSimpleName(), + e.getMessage()); + } + } + } + + /** + * Copied method from org.apache.hadoop.yarn.util.Apps. It was broken by YARN-1824 (2.4.0) and + * fixed for 2.4.1 by https://issues.apache.org/jira/browse/YARN-1931 + */ + public static void addToEnvironment(Map environment, String variable, String value) { + String val = environment.get(variable); + if (val == null) { + val = value; + } else { + val = val + YarnClusterDescriptor.pathSeparator + value; + } + environment.put(StringInterner.weakIntern(variable), StringInterner.weakIntern(val)); + } + + /** + * Resolve keytab path either as absolute path or relative to working directory. + * + * @param workingDir current working directory + * @param keytabPath configured keytab path. + * @return resolved keytab path, or null if not found. 
+ */ + public static String resolveKeytabPath(String workingDir, String keytabPath) { + String keytab = null; + if (keytabPath != null) { + File f; + f = new File(keytabPath); + if (f.exists()) { + keytab = f.getAbsolutePath(); + LOG.info("Resolved keytab path: {}", keytab); + } else { + // try using relative paths, this is the case when the keytab was shipped + // as a local resource + f = new File(workingDir, keytabPath); + if (f.exists()) { + keytab = f.getAbsolutePath(); + LOG.info("Resolved keytab path: {}", keytab); + } else { + LOG.warn("Could not resolve keytab path with: {}", keytabPath); + keytab = null; + } + } + } + return keytab; + } + + /** Private constructor to prevent instantiation. */ + private Utils() { + throw new RuntimeException(); + } + + /** + * Creates the launch context, which describes how to bring up a TaskExecutor / TaskManager + * process in an allocated YARN container. + * + *
<p>
This code is extremely YARN specific and registers all the resources that the TaskExecutor + * needs (such as JAR file, config file, ...) and all environment variables in a YARN container + * launch context. The launch context then ensures that those resources will be copied into the + * containers transient working directory. + * + * @param flinkConfig The Flink configuration object. + * @param yarnConfig The YARN configuration object. + * @param configuration The YarnResourceManagerDriver configurations. + * @param tmParams The TaskExecutor container memory parameters. + * @param taskManagerDynamicProperties The dynamic configurations to be updated for the + * TaskExecutors based on client uploaded Flink config. + * @param workingDirectory The current application master container's working directory. + * @param taskManagerMainClass The class with the main method. + * @param log The logger. + * @return The launch context for the TaskManager processes. + * @throws Exception Thrown if the launch context could not be created, for example if the + * resources could not be copied. + */ + static ContainerLaunchContext createTaskExecutorContext( + org.apache.flink.configuration.Configuration flinkConfig, + YarnConfiguration yarnConfig, + YarnResourceManagerDriverConfiguration configuration, + ContaineredTaskManagerParameters tmParams, + String taskManagerDynamicProperties, + String workingDirectory, + Class taskManagerMainClass, + Logger log) + throws Exception { + + // get and validate all relevant variables + + String remoteFlinkJarPath = checkNotNull( + configuration.getFlinkDistJar(), "Environment variable %s not set", YarnConfigKeys.FLINK_DIST_JAR); + + String shipListString = checkNotNull( + configuration.getClientShipFiles(), + "Environment variable %s not set", + YarnConfigKeys.ENV_CLIENT_SHIP_FILES); + + final String remoteKeytabPath = configuration.getRemoteKeytabPath(); + final String localKeytabPath = configuration.getLocalKeytabPath(); + final String keytabPrincipal = configuration.getKeytabPrinciple(); + final String remoteYarnConfPath = configuration.getYarnSiteXMLPath(); + final String remoteKrb5Path = configuration.getKrb5Path(); + + if (log.isDebugEnabled()) { + log.debug("TM:remote keytab path obtained {}", remoteKeytabPath); + log.debug("TM:local keytab path obtained {}", localKeytabPath); + log.debug("TM:keytab principal obtained {}", keytabPrincipal); + log.debug("TM:remote yarn conf path obtained {}", remoteYarnConfPath); + log.debug("TM:remote krb5 path obtained {}", remoteKrb5Path); + } + + String classPathString = checkNotNull( + configuration.getFlinkClasspath(), + "Environment variable %s not set", + YarnConfigKeys.ENV_FLINK_CLASSPATH); + + // register keytab + LocalResource keytabResource = null; + if (remoteKeytabPath != null) { + log.info("TM:Adding keytab {} to the container local resource bucket", remoteKeytabPath); + Path keytabPath = new Path(remoteKeytabPath); + FileSystem fs = keytabPath.getFileSystem(yarnConfig); + keytabResource = registerLocalResource(fs, keytabPath, LocalResourceType.FILE); + } + + // To support Yarn Secure Integration Test Scenario + LocalResource yarnConfResource = null; + if (remoteYarnConfPath != null) { + log.info("TM:Adding remoteYarnConfPath {} to the container local resource bucket", remoteYarnConfPath); + Path yarnConfPath = new Path(remoteYarnConfPath); + FileSystem fs = yarnConfPath.getFileSystem(yarnConfig); + yarnConfResource = registerLocalResource(fs, yarnConfPath, LocalResourceType.FILE); + } + + // register krb5.conf + 
LocalResource krb5ConfResource = null; + boolean hasKrb5 = false; + if (remoteKrb5Path != null) { + log.info("Adding remoteKrb5Path {} to the container local resource bucket", remoteKrb5Path); + Path krb5ConfPath = new Path(remoteKrb5Path); + FileSystem fs = krb5ConfPath.getFileSystem(yarnConfig); + krb5ConfResource = registerLocalResource(fs, krb5ConfPath, LocalResourceType.FILE); + hasKrb5 = true; + } + + Map taskManagerLocalResources = new HashMap<>(); + + // register Flink Jar with remote HDFS + final YarnLocalResourceDescriptor flinkDistLocalResourceDesc = + YarnLocalResourceDescriptor.fromString(remoteFlinkJarPath); + taskManagerLocalResources.put( + flinkDistLocalResourceDesc.getResourceKey(), flinkDistLocalResourceDesc.toLocalResource()); + + // To support Yarn Secure Integration Test Scenario + if (yarnConfResource != null) { + taskManagerLocalResources.put(YARN_SITE_FILE_NAME, yarnConfResource); + } + if (krb5ConfResource != null) { + taskManagerLocalResources.put(KRB5_FILE_NAME, krb5ConfResource); + } + if (keytabResource != null) { + taskManagerLocalResources.put(localKeytabPath, keytabResource); + } + + // prepare additional files to be shipped + decodeYarnLocalResourceDescriptorListFromString(shipListString) + .forEach(resourceDesc -> + taskManagerLocalResources.put(resourceDesc.getResourceKey(), resourceDesc.toLocalResource())); + + // now that all resources are prepared, we can create the launch context + + log.info("Creating container launch context for TaskManagers"); + + boolean hasLogback = new File(workingDirectory, "logback.xml").exists(); + boolean hasLog4j = new File(workingDirectory, "log4j.properties").exists(); + + String launchCommand = BootstrapTools.getTaskManagerShellCommand( + flinkConfig, + tmParams, + ".", + ApplicationConstants.LOG_DIR_EXPANSION_VAR, + hasLogback, + hasLog4j, + hasKrb5, + taskManagerMainClass, + taskManagerDynamicProperties); + + if (log.isDebugEnabled()) { + log.debug("Starting TaskManagers with command: " + launchCommand); + } else { + log.info("Starting TaskManagers"); + } + + ContainerLaunchContext ctx = Records.newRecord(ContainerLaunchContext.class); + ctx.setCommands(Collections.singletonList(launchCommand)); + ctx.setLocalResources(taskManagerLocalResources); + + Map containerEnv = new HashMap<>(); + containerEnv.putAll(tmParams.taskManagerEnv()); + + // add YARN classpath, etc to the container environment + containerEnv.put(ENV_FLINK_CLASSPATH, classPathString); + setupYarnClassPath(yarnConfig, containerEnv); + + containerEnv.put( + YarnConfigKeys.ENV_HADOOP_USER_NAME, + UserGroupInformation.getCurrentUser().getUserName()); + + if (remoteKeytabPath != null && localKeytabPath != null && keytabPrincipal != null) { + containerEnv.put(YarnConfigKeys.REMOTE_KEYTAB_PATH, remoteKeytabPath); + containerEnv.put(YarnConfigKeys.LOCAL_KEYTAB_PATH, localKeytabPath); + containerEnv.put(YarnConfigKeys.KEYTAB_PRINCIPAL, keytabPrincipal); + } else if (localKeytabPath != null && keytabPrincipal != null) { + containerEnv.put(YarnConfigKeys.LOCAL_KEYTAB_PATH, localKeytabPath); + containerEnv.put(YarnConfigKeys.KEYTAB_PRINCIPAL, keytabPrincipal); + } + + ctx.setEnvironment(containerEnv); + + // For TaskManager YARN container context, read the tokens from the jobmanager yarn + // container local file. + // NOTE: must read the tokens from the local file, not from the UGI context, because if UGI + // is login + // using Kerberos keytabs, there is no HDFS delegation token in the UGI context. 
+ final String fileLocation = System.getenv(UserGroupInformation.HADOOP_TOKEN_FILE_LOCATION); + + if (fileLocation != null) { + log.debug("Adding security tokens to TaskExecutor's container launch context."); + + try (DataOutputBuffer dob = new DataOutputBuffer()) { + Credentials cred = Credentials.readTokenStorageFile( + new File(fileLocation), HadoopUtils.getHadoopConfiguration(flinkConfig)); + + // Filter out AMRMToken before setting the tokens to the TaskManager container + // context. + Credentials taskManagerCred = new Credentials(); + Collection> userTokens = cred.getAllTokens(); + for (Token token : userTokens) { + if (!token.getKind().equals(AMRMTokenIdentifier.KIND_NAME)) { + taskManagerCred.addToken(token.getService(), token); + } + } + + taskManagerCred.writeTokenStorageToStream(dob); + ByteBuffer securityTokens = ByteBuffer.wrap(dob.getData(), 0, dob.getLength()); + ctx.setTokens(securityTokens); + } catch (Throwable t) { + log.error("Failed to add Hadoop's security tokens.", t); + } + } else { + log.info("Could not set security tokens because Hadoop's token file location is unknown."); + } + + return ctx; + } + + static boolean isRemotePath(String path) throws IOException { + org.apache.flink.core.fs.Path flinkPath = new org.apache.flink.core.fs.Path(path); + return flinkPath.getFileSystem().isDistributedFS(); + } + + private static List decodeYarnLocalResourceDescriptorListFromString(String resources) + throws Exception { + final List resourceDescriptors = new ArrayList<>(); + for (String shipResourceDescStr : resources.split(LOCAL_RESOURCE_DESCRIPTOR_SEPARATOR)) { + if (!shipResourceDescStr.isEmpty()) { + resourceDescriptors.add(YarnLocalResourceDescriptor.fromString(shipResourceDescStr)); + } + } + return resourceDescriptors; + } + + @VisibleForTesting + static Resource getUnitResource(YarnConfiguration yarnConfig) { + final int unitMemMB, unitVcore; + + final String yarnRmSchedulerClazzName = yarnConfig.get(YarnConfiguration.RM_SCHEDULER); + if (Objects.equals(yarnRmSchedulerClazzName, YARN_RM_FAIR_SCHEDULER_CLAZZ) + || Objects.equals(yarnRmSchedulerClazzName, YARN_RM_SLS_FAIR_SCHEDULER_CLAZZ)) { + String propMem = yarnConfig.get(YARN_RM_INCREMENT_ALLOCATION_MB_KEY); + String propVcore = yarnConfig.get(YARN_RM_INCREMENT_ALLOCATION_VCORES_KEY); + + unitMemMB = propMem != null + ? Integer.parseInt(propMem) + : yarnConfig.getInt( + YARN_RM_INCREMENT_ALLOCATION_MB_LEGACY_KEY, DEFAULT_YARN_RM_INCREMENT_ALLOCATION_MB); + unitVcore = propVcore != null + ? 
Integer.parseInt(propVcore) + : yarnConfig.getInt( + YARN_RM_INCREMENT_ALLOCATION_VCORES_LEGACY_KEY, + DEFAULT_YARN_RM_INCREMENT_ALLOCATION_VCORES); + } else { + unitMemMB = yarnConfig.getInt( + YarnConfiguration.RM_SCHEDULER_MINIMUM_ALLOCATION_MB, + YarnConfiguration.DEFAULT_RM_SCHEDULER_MINIMUM_ALLOCATION_MB); + unitVcore = yarnConfig.getInt( + YarnConfiguration.RM_SCHEDULER_MINIMUM_ALLOCATION_VCORES, + YarnConfiguration.DEFAULT_RM_SCHEDULER_MINIMUM_ALLOCATION_VCORES); + } + + return Resource.newInstance(unitMemMB, unitVcore); + } + + public static List getQualifiedRemoteSharedPaths( + org.apache.flink.configuration.Configuration configuration, YarnConfiguration yarnConfiguration) + throws IOException, FlinkException { + + return getRemoteSharedPaths(configuration, pathStr -> { + final Path path = new Path(pathStr); + return path.getFileSystem(yarnConfiguration).makeQualified(path); + }); + } + + private static List getRemoteSharedPaths( + org.apache.flink.configuration.Configuration configuration, + FunctionWithException strToPathMapper) + throws IOException, FlinkException { + + final List providedLibDirs = + ConfigUtils.decodeListFromConfig(configuration, YarnConfigOptions.PROVIDED_LIB_DIRS, strToPathMapper); + + for (Path path : providedLibDirs) { + if (!Utils.isRemotePath(path.toString())) { + throw new FlinkException("The \"" + + YarnConfigOptions.PROVIDED_LIB_DIRS.key() + + "\" should only contain" + + " dirs accessible from all worker nodes, while the \"" + + path + + "\" is local."); + } + } + return providedLibDirs; + } + + public static YarnConfiguration getYarnAndHadoopConfiguration( + org.apache.flink.configuration.Configuration flinkConfig) { + final YarnConfiguration yarnConfig = getYarnConfiguration(flinkConfig); + yarnConfig.addResource(HadoopUtils.getHadoopConfiguration(flinkConfig)); + + return yarnConfig; + } + + /** + * Add additional config entries from the flink config to the yarn config. + * + * @param flinkConfig The Flink configuration object. + * @return The yarn configuration. + */ + public static YarnConfiguration getYarnConfiguration(org.apache.flink.configuration.Configuration flinkConfig) { + final YarnConfiguration yarnConfig = new YarnConfiguration(); + + for (String key : flinkConfig.keySet()) { + for (String prefix : FLINK_CONFIG_PREFIXES) { + if (key.startsWith(prefix)) { + String newKey = key.substring("flink.".length()); + String value = flinkConfig.getString(key, null); + yarnConfig.set(newKey, value); + LOG.debug("Adding Flink config entry for {} as {}={} to Yarn config", key, newKey, value); + } + } + } + + return yarnConfig; + } +} diff --git a/dinky-client/dinky-client-1.14/src/main/java/org/apache/flink/yarn/YarnClusterDescriptor.java b/dinky-client/dinky-client-1.14/src/main/java/org/apache/flink/yarn/YarnClusterDescriptor.java new file mode 100644 index 0000000000..745a634413 --- /dev/null +++ b/dinky-client/dinky-client-1.14/src/main/java/org/apache/flink/yarn/YarnClusterDescriptor.java @@ -0,0 +1,1634 @@ +/* + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package org.apache.flink.yarn; + +import static org.apache.flink.configuration.ConfigConstants.DEFAULT_FLINK_USR_LIB_DIR; +import static org.apache.flink.configuration.ConfigConstants.ENV_FLINK_LIB_DIR; +import static org.apache.flink.runtime.entrypoint.component.FileJobGraphRetriever.JOB_GRAPH_FILE_PATH; +import static org.apache.flink.util.Preconditions.checkArgument; +import static org.apache.flink.util.Preconditions.checkNotNull; +import static org.apache.flink.yarn.YarnConfigKeys.LOCAL_RESOURCE_DESCRIPTOR_SEPARATOR; + +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.flink.api.common.cache.DistributedCache; +import org.apache.flink.api.java.tuple.Tuple2; +import org.apache.flink.client.deployment.ClusterDeploymentException; +import org.apache.flink.client.deployment.ClusterDescriptor; +import org.apache.flink.client.deployment.ClusterRetrieveException; +import org.apache.flink.client.deployment.ClusterSpecification; +import org.apache.flink.client.deployment.application.ApplicationConfiguration; +import org.apache.flink.client.program.ClusterClientProvider; +import org.apache.flink.client.program.rest.RestClusterClient; +import org.apache.flink.configuration.ConfigConstants; +import org.apache.flink.configuration.ConfigOption; +import org.apache.flink.configuration.ConfigUtils; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.configuration.ConfigurationUtils; +import org.apache.flink.configuration.CoreOptions; +import org.apache.flink.configuration.HighAvailabilityOptions; +import org.apache.flink.configuration.IllegalConfigurationException; +import org.apache.flink.configuration.JobManagerOptions; +import org.apache.flink.configuration.PipelineOptions; +import org.apache.flink.configuration.ResourceManagerOptions; +import org.apache.flink.configuration.RestOptions; +import org.apache.flink.configuration.SecurityOptions; +import org.apache.flink.core.plugin.PluginConfig; +import org.apache.flink.core.plugin.PluginUtils; +import org.apache.flink.runtime.clusterframework.BootstrapTools; +import org.apache.flink.runtime.entrypoint.ClusterEntrypoint; +import org.apache.flink.runtime.jobgraph.JobGraph; +import org.apache.flink.runtime.jobmanager.HighAvailabilityMode; +import org.apache.flink.runtime.jobmanager.JobManagerProcessSpec; +import org.apache.flink.runtime.jobmanager.JobManagerProcessUtils; +import org.apache.flink.runtime.util.HadoopUtils; +import org.apache.flink.util.CollectionUtil; +import org.apache.flink.util.FlinkException; +import org.apache.flink.util.Preconditions; +import org.apache.flink.util.ShutdownHookUtil; +import org.apache.flink.util.StringUtils; +import org.apache.flink.yarn.configuration.YarnConfigOptions; +import org.apache.flink.yarn.configuration.YarnConfigOptionsInternal; +import org.apache.flink.yarn.configuration.YarnDeploymentTarget; +import org.apache.flink.yarn.configuration.YarnLogConfigUtil; +import org.apache.flink.yarn.entrypoint.YarnApplicationClusterEntryPoint; +import org.apache.flink.yarn.entrypoint.YarnJobClusterEntrypoint; +import 
org.apache.flink.yarn.entrypoint.YarnSessionClusterEntrypoint; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hdfs.DFSConfigKeys; +import org.apache.hadoop.security.UserGroupInformation; +import org.apache.hadoop.yarn.api.ApplicationConstants; +import org.apache.hadoop.yarn.api.protocolrecords.GetNewApplicationResponse; +import org.apache.hadoop.yarn.api.records.ApplicationId; +import org.apache.hadoop.yarn.api.records.ApplicationReport; +import org.apache.hadoop.yarn.api.records.ApplicationSubmissionContext; +import org.apache.hadoop.yarn.api.records.ContainerLaunchContext; +import org.apache.hadoop.yarn.api.records.FinalApplicationStatus; +import org.apache.hadoop.yarn.api.records.LocalResourceType; +import org.apache.hadoop.yarn.api.records.NodeReport; +import org.apache.hadoop.yarn.api.records.NodeState; +import org.apache.hadoop.yarn.api.records.Priority; +import org.apache.hadoop.yarn.api.records.QueueInfo; +import org.apache.hadoop.yarn.api.records.Resource; +import org.apache.hadoop.yarn.api.records.YarnApplicationState; +import org.apache.hadoop.yarn.api.records.YarnClusterMetrics; +import org.apache.hadoop.yarn.client.api.YarnClient; +import org.apache.hadoop.yarn.client.api.YarnClientApplication; +import org.apache.hadoop.yarn.conf.YarnConfiguration; +import org.apache.hadoop.yarn.exceptions.YarnException; +import org.apache.hadoop.yarn.util.ConverterUtils; +import org.apache.hadoop.yarn.util.Records; + +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.ObjectOutputStream; +import java.io.PrintStream; +import java.io.UnsupportedEncodingException; +import java.lang.reflect.InvocationTargetException; +import java.lang.reflect.Method; +import java.net.URI; +import java.net.URLDecoder; +import java.nio.charset.Charset; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Set; +import java.util.stream.Collectors; + +import javax.annotation.Nullable; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * The descriptor with deployment information for deploying a Flink cluster on Yarn. + */ +public class YarnClusterDescriptor implements ClusterDescriptor { + private static final Logger LOG = LoggerFactory.getLogger(YarnClusterDescriptor.class); + + private final YarnConfiguration yarnConfiguration; + public static final String pathSeparator = ":"; + + private final YarnClient yarnClient; + + private final YarnClusterInformationRetriever yarnClusterInformationRetriever; + + /** + * True if the descriptor must not shut down the YarnClient. + */ + private final boolean sharedYarnClient; + + /** + * Lazily initialized list of files to ship. 
+ */ + private final List shipFiles = new LinkedList<>(); + + private final List shipArchives = new LinkedList<>(); + + private final String yarnQueue; + + private Path flinkJarPath; + + private final Configuration flinkConfiguration; + + private final String customName; + + private final String nodeLabel; + + private final String applicationType; + + private YarnConfigOptions.UserJarInclusion userJarInclusion; + + public YarnClusterDescriptor( + Configuration flinkConfiguration, + YarnConfiguration yarnConfiguration, + YarnClient yarnClient, + YarnClusterInformationRetriever yarnClusterInformationRetriever, + boolean sharedYarnClient) { + + this.yarnConfiguration = Preconditions.checkNotNull(yarnConfiguration); + this.yarnClient = Preconditions.checkNotNull(yarnClient); + this.yarnClusterInformationRetriever = Preconditions.checkNotNull(yarnClusterInformationRetriever); + this.sharedYarnClient = sharedYarnClient; + + this.flinkConfiguration = Preconditions.checkNotNull(flinkConfiguration); + this.userJarInclusion = getUserJarInclusionMode(flinkConfiguration); + + getLocalFlinkDistPath(flinkConfiguration).ifPresent(this::setLocalJarPath); + decodeFilesToShipToCluster(flinkConfiguration, YarnConfigOptions.SHIP_FILES) + .ifPresent(this::addShipFiles); + decodeFilesToShipToCluster(flinkConfiguration, YarnConfigOptions.SHIP_ARCHIVES) + .ifPresent(this::addShipArchives); + + this.yarnQueue = flinkConfiguration.getString(YarnConfigOptions.APPLICATION_QUEUE); + this.customName = flinkConfiguration.getString(YarnConfigOptions.APPLICATION_NAME); + this.applicationType = flinkConfiguration.getString(YarnConfigOptions.APPLICATION_TYPE); + this.nodeLabel = flinkConfiguration.getString(YarnConfigOptions.NODE_LABEL); + } + + private Optional> decodeFilesToShipToCluster( + final Configuration configuration, final ConfigOption> configOption) { + checkNotNull(configuration); + checkNotNull(configOption); + + final List files = ConfigUtils.decodeListFromConfig(configuration, configOption, File::new); + return files.isEmpty() ? Optional.empty() : Optional.of(files); + } + + private Optional getLocalFlinkDistPath(final Configuration configuration) { + final String localJarPath = configuration.getString(YarnConfigOptions.FLINK_DIST_JAR); + if (localJarPath != null) { + return Optional.of(new Path(localJarPath)); + } + + LOG.info("No path for the flink jar passed. Using the location of " + getClass() + " to locate the jar"); + + // check whether it's actually a jar file --> when testing we execute this class without a + // flink-dist jar + final String decodedPath = getDecodedJarPath(); + return decodedPath.endsWith(".jar") ? Optional.of(new Path(new File(decodedPath).toURI())) : Optional.empty(); + } + + private String getDecodedJarPath() { + final String encodedJarPath = YarnClusterClientFactory.class + .getProtectionDomain() + .getCodeSource() + .getLocation() + .getPath(); + try { + return URLDecoder.decode(encodedJarPath, Charset.defaultCharset().name()); + } catch (UnsupportedEncodingException e) { + throw new RuntimeException("Couldn't decode the encoded Flink dist jar path: " + + encodedJarPath + + " You can supply a path manually via the command line."); + } + } + + @VisibleForTesting + List getShipFiles() { + return shipFiles; + } + + public YarnClient getYarnClient() { + return yarnClient; + } + + /** + * The class to start the application master with. This class runs the main method in case of + * session cluster. 
+ */ + protected String getYarnSessionClusterEntrypoint() { + return YarnSessionClusterEntrypoint.class.getName(); + } + + /** + * The class to start the application master with. This class runs the main method in case of + * the job cluster. + */ + protected String getYarnJobClusterEntrypoint() { + return YarnJobClusterEntrypoint.class.getName(); + } + + public Configuration getFlinkConfiguration() { + return flinkConfiguration; + } + + public void setLocalJarPath(Path localJarPath) { + if (!localJarPath.toString().endsWith("jar")) { + throw new IllegalArgumentException( + "The passed jar path ('" + localJarPath + "') does not end with the 'jar' extension"); + } + this.flinkJarPath = localJarPath; + } + + /** + * Adds the given files to the list of files to ship. + * + *
<p>
Note that any file matching "flink-dist*.jar" will be excluded from the upload by + * {@link YarnApplicationFileUploader#registerMultipleLocalResources(Collection, String, + * LocalResourceType)} since we upload the Flink uber jar ourselves and do not need to deploy it + * multiple times. + * + * @param shipFiles files to ship + */ + public void addShipFiles(List shipFiles) { + checkArgument( + userJarInclusion != YarnConfigOptions.UserJarInclusion.DISABLED + || isUsrLibDirIncludedInShipFiles(shipFiles), + "This is an illegal ship directory : %s. When setting the %s to %s the name of ship directory can not be %s.", + ConfigConstants.DEFAULT_FLINK_USR_LIB_DIR, + YarnConfigOptions.CLASSPATH_INCLUDE_USER_JAR.key(), + YarnConfigOptions.UserJarInclusion.DISABLED, + ConfigConstants.DEFAULT_FLINK_USR_LIB_DIR); + this.shipFiles.addAll(shipFiles); + } + + private void addShipArchives(List shipArchives) { + checkArgument(isArchiveOnlyIncludedInShipArchiveFiles(shipArchives), "Non-archive files are included."); + this.shipArchives.addAll(shipArchives); + } + + private static boolean isArchiveOnlyIncludedInShipArchiveFiles(List shipFiles) { + return shipFiles.stream() + .filter(File::isFile) + .map(File::getName) + .map(String::toLowerCase) + .allMatch(name -> name.endsWith(".tar.gz") + || name.endsWith(".tar") + || name.endsWith(".tgz") + || name.endsWith(".dst") + || name.endsWith(".jar") + || name.endsWith(".zip")); + } + + private void isReadyForDeployment(ClusterSpecification clusterSpecification) throws Exception { + + if (this.flinkJarPath == null) { + throw new YarnDeploymentException("The Flink jar path is null"); + } + if (this.flinkConfiguration == null) { + throw new YarnDeploymentException("Flink configuration object has not been set"); + } + + // Check if we don't exceed YARN's maximum virtual cores. + final int numYarnMaxVcores = yarnClusterInformationRetriever.getMaxVcores(); + + int configuredAmVcores = flinkConfiguration.getInteger(YarnConfigOptions.APP_MASTER_VCORES); + if (configuredAmVcores > numYarnMaxVcores) { + throw new IllegalConfigurationException(String.format( + "The number of requested virtual cores for application master %d" + + " exceeds the maximum number of virtual cores %d available in the Yarn Cluster.", + configuredAmVcores, numYarnMaxVcores)); + } + + int configuredVcores = + flinkConfiguration.getInteger(YarnConfigOptions.VCORES, clusterSpecification.getSlotsPerTaskManager()); + // don't configure more than the maximum configured number of vcores + if (configuredVcores > numYarnMaxVcores) { + throw new IllegalConfigurationException(String.format( + "The number of requested virtual cores per node %d" + + " exceeds the maximum number of virtual cores %d available in the Yarn Cluster." + + " Please note that the number of virtual cores is set to the number of task slots by default" + + " unless configured in the Flink config with '%s.'", + configuredVcores, numYarnMaxVcores, YarnConfigOptions.VCORES.key())); + } + + // check if required Hadoop environment variables are set. If not, warn user + if (System.getenv("HADOOP_CONF_DIR") == null && System.getenv("YARN_CONF_DIR") == null) { + LOG.warn("Neither the HADOOP_CONF_DIR nor the YARN_CONF_DIR environment variable is set. 
" + + "The Flink YARN Client needs one of these to be set to properly load the Hadoop " + + "configuration for accessing YARN."); + } + } + + public String getNodeLabel() { + return nodeLabel; + } + + // ------------------------------------------------------------- + // Lifecycle management + // ------------------------------------------------------------- + + @Override + public void close() { + if (!sharedYarnClient) { + yarnClient.stop(); + } + } + + // ------------------------------------------------------------- + // ClusterClient overrides + // ------------------------------------------------------------- + + @Override + public ClusterClientProvider retrieve(ApplicationId applicationId) throws ClusterRetrieveException { + + try { + // check if required Hadoop environment variables are set. If not, warn user + if (System.getenv("HADOOP_CONF_DIR") == null && System.getenv("YARN_CONF_DIR") == null) { + LOG.warn("Neither the HADOOP_CONF_DIR nor the YARN_CONF_DIR environment variable is set." + + "The Flink YARN Client needs one of these to be set to properly load the Hadoop " + + "configuration for accessing YARN."); + } + + final ApplicationReport report = yarnClient.getApplicationReport(applicationId); + + if (report.getFinalApplicationStatus() != FinalApplicationStatus.UNDEFINED) { + // Flink cluster is not running anymore + LOG.error( + "The application {} doesn't run anymore. It has previously completed with final status: {}", + applicationId, + report.getFinalApplicationStatus()); + throw new RuntimeException("The Yarn application " + applicationId + " doesn't run anymore."); + } + + setClusterEntrypointInfoToConfig(report); + + return () -> { + try { + return new RestClusterClient<>(flinkConfiguration, report.getApplicationId()); + } catch (Exception e) { + throw new RuntimeException("Couldn't retrieve Yarn cluster", e); + } + }; + } catch (Exception e) { + throw new ClusterRetrieveException("Couldn't retrieve Yarn cluster", e); + } + } + + @Override + public ClusterClientProvider deploySessionCluster(ClusterSpecification clusterSpecification) + throws ClusterDeploymentException { + try { + return deployInternal( + clusterSpecification, "Flink session cluster", getYarnSessionClusterEntrypoint(), null, false); + } catch (Exception e) { + throw new ClusterDeploymentException("Couldn't deploy Yarn session cluster", e); + } + } + + @Override + public ClusterClientProvider deployApplicationCluster( + final ClusterSpecification clusterSpecification, final ApplicationConfiguration applicationConfiguration) + throws ClusterDeploymentException { + checkNotNull(clusterSpecification); + checkNotNull(applicationConfiguration); + + final YarnDeploymentTarget deploymentTarget = YarnDeploymentTarget.fromConfig(flinkConfiguration); + if (YarnDeploymentTarget.APPLICATION != deploymentTarget) { + throw new ClusterDeploymentException("Couldn't deploy Yarn Application Cluster." 
+ + " Expected deployment.target=" + + YarnDeploymentTarget.APPLICATION.getName() + + " but actual one was \"" + + deploymentTarget.getName() + + "\""); + } + + applicationConfiguration.applyToConfiguration(flinkConfiguration); + + final List pipelineJars = + flinkConfiguration.getOptional(PipelineOptions.JARS).orElse(Collections.emptyList()); + Preconditions.checkArgument(pipelineJars.size() == 1, "Should only have one jar"); + + try { + return deployInternal( + clusterSpecification, + "Flink Application Cluster", + YarnApplicationClusterEntryPoint.class.getName(), + null, + false); + } catch (Exception e) { + throw new ClusterDeploymentException("Couldn't deploy Yarn Application Cluster", e); + } + } + + @Override + public ClusterClientProvider deployJobCluster( + ClusterSpecification clusterSpecification, JobGraph jobGraph, boolean detached) + throws ClusterDeploymentException { + try { + return deployInternal( + clusterSpecification, "Flink per-job cluster", getYarnJobClusterEntrypoint(), jobGraph, detached); + } catch (Exception e) { + throw new ClusterDeploymentException("Could not deploy Yarn job cluster.", e); + } + } + + @Override + public void killCluster(ApplicationId applicationId) throws FlinkException { + try { + yarnClient.killApplication(applicationId); + + try (final FileSystem fs = FileSystem.get(yarnConfiguration)) { + final Path applicationDir = + YarnApplicationFileUploader.getApplicationDirPath(getStagingDir(fs), applicationId); + + Utils.deleteApplicationFiles(applicationDir.toUri().toString()); + } + + } catch (YarnException | IOException e) { + throw new FlinkException("Could not kill the Yarn Flink cluster with id " + applicationId + '.', e); + } + } + + /** + * This method will block until the ApplicationMaster/JobManager have been deployed on YARN. + * + * @param clusterSpecification Initial cluster specification for the Flink cluster to be + * deployed + * @param applicationName name of the Yarn application to start + * @param yarnClusterEntrypoint Class name of the Yarn cluster entry point. 
+ * @param jobGraph A job graph which is deployed with the Flink cluster, {@code null} if none + * @param detached True if the cluster should be started in detached mode + */ + private ClusterClientProvider deployInternal( + ClusterSpecification clusterSpecification, + String applicationName, + String yarnClusterEntrypoint, + @Nullable JobGraph jobGraph, + boolean detached) + throws Exception { + + final UserGroupInformation currentUser = UserGroupInformation.getCurrentUser(); + if (HadoopUtils.isKerberosSecurityEnabled(currentUser)) { + boolean useTicketCache = flinkConfiguration.getBoolean(SecurityOptions.KERBEROS_LOGIN_USETICKETCACHE); + + if (!HadoopUtils.areKerberosCredentialsValid(currentUser, useTicketCache)) { + throw new RuntimeException("Hadoop security with Kerberos is enabled but the login user " + + "does not have Kerberos credentials or delegation tokens!"); + } + + final boolean fetchToken = flinkConfiguration.getBoolean(SecurityOptions.KERBEROS_FETCH_DELEGATION_TOKEN); + final boolean yarnAccessFSEnabled = + !CollectionUtil.isNullOrEmpty(flinkConfiguration.get(YarnConfigOptions.YARN_ACCESS)); + if (!fetchToken && yarnAccessFSEnabled) { + throw new IllegalConfigurationException(String.format( + "When %s is disabled, %s must be disabled as well.", + SecurityOptions.KERBEROS_FETCH_DELEGATION_TOKEN.key(), YarnConfigOptions.YARN_ACCESS.key())); + } + } + + isReadyForDeployment(clusterSpecification); + + // ------------------ Check if the specified queue exists -------------------- + + checkYarnQueues(yarnClient); + + // ------------------ Check if the YARN ClusterClient has the requested resources + // -------------- + + // Create application via yarnClient + final YarnClientApplication yarnApplication = yarnClient.createApplication(); + final GetNewApplicationResponse appResponse = yarnApplication.getNewApplicationResponse(); + + Resource maxRes = appResponse.getMaximumResourceCapability(); + + final ClusterResourceDescription freeClusterMem; + try { + freeClusterMem = getCurrentFreeClusterResources(yarnClient); + } catch (YarnException | IOException e) { + failSessionDuringDeployment(yarnClient, yarnApplication); + throw new YarnDeploymentException("Could not retrieve information about free cluster resources.", e); + } + + final int yarnMinAllocationMB = yarnConfiguration.getInt( + YarnConfiguration.RM_SCHEDULER_MINIMUM_ALLOCATION_MB, + YarnConfiguration.DEFAULT_RM_SCHEDULER_MINIMUM_ALLOCATION_MB); + if (yarnMinAllocationMB <= 0) { + throw new YarnDeploymentException("The minimum allocation memory " + + "(" + + yarnMinAllocationMB + + " MB) configured via '" + + YarnConfiguration.RM_SCHEDULER_MINIMUM_ALLOCATION_MB + + "' should be greater than 0."); + } + + final ClusterSpecification validClusterSpecification; + try { + validClusterSpecification = + validateClusterResources(clusterSpecification, yarnMinAllocationMB, maxRes, freeClusterMem); + } catch (YarnDeploymentException yde) { + failSessionDuringDeployment(yarnClient, yarnApplication); + throw yde; + } + + LOG.info("Cluster specification: {}", validClusterSpecification); + + final ClusterEntrypoint.ExecutionMode executionMode = + detached ? 
ClusterEntrypoint.ExecutionMode.DETACHED : ClusterEntrypoint.ExecutionMode.NORMAL; + + flinkConfiguration.setString(ClusterEntrypoint.INTERNAL_CLUSTER_EXECUTION_MODE, executionMode.toString()); + + ApplicationReport report = startAppMaster( + flinkConfiguration, + applicationName, + yarnClusterEntrypoint, + jobGraph, + yarnClient, + yarnApplication, + validClusterSpecification); + + // print the application id for user to cancel themselves. + if (detached) { + final ApplicationId yarnApplicationId = report.getApplicationId(); + logDetachedClusterInformation(yarnApplicationId, LOG); + } + + setClusterEntrypointInfoToConfig(report); + + return () -> { + try { + return new RestClusterClient<>(flinkConfiguration, report.getApplicationId()); + } catch (Exception e) { + throw new RuntimeException("Error while creating RestClusterClient.", e); + } + }; + } + + private ClusterSpecification validateClusterResources( + ClusterSpecification clusterSpecification, + int yarnMinAllocationMB, + Resource maximumResourceCapability, + ClusterResourceDescription freeClusterResources) + throws YarnDeploymentException { + + int jobManagerMemoryMb = clusterSpecification.getMasterMemoryMB(); + final int taskManagerMemoryMb = clusterSpecification.getTaskManagerMemoryMB(); + + logIfComponentMemNotIntegerMultipleOfYarnMinAllocation("JobManager", jobManagerMemoryMb, yarnMinAllocationMB); + logIfComponentMemNotIntegerMultipleOfYarnMinAllocation("TaskManager", taskManagerMemoryMb, yarnMinAllocationMB); + + // set the memory to minAllocationMB to do the next checks correctly + if (jobManagerMemoryMb < yarnMinAllocationMB) { + jobManagerMemoryMb = yarnMinAllocationMB; + } + + final String note = + "Please check the 'yarn.scheduler.maximum-allocation-mb' and the 'yarn.nodemanager.resource.memory-mb' configuration values\n"; + if (jobManagerMemoryMb > maximumResourceCapability.getMemory()) { + throw new YarnDeploymentException( + "The cluster does not have the requested resources for the JobManager available!\n" + + "Maximum Memory: " + + maximumResourceCapability.getMemory() + + "MB Requested: " + + jobManagerMemoryMb + + "MB. " + + note); + } + + if (taskManagerMemoryMb > maximumResourceCapability.getMemory()) { + throw new YarnDeploymentException( + "The cluster does not have the requested resources for the TaskManagers available!\n" + + "Maximum Memory: " + + maximumResourceCapability.getMemory() + + " Requested: " + + taskManagerMemoryMb + + "MB. " + + note); + } + + final String noteRsc = + "\nThe Flink YARN client will try to allocate the YARN session, but maybe not all TaskManagers are " + + "connecting from the beginning because the resources are currently not available in the cluster. 
" + + "The allocation might take more time than usual because the Flink YARN client needs to wait until " + + "the resources become available."; + + if (taskManagerMemoryMb > freeClusterResources.containerLimit) { + LOG.warn("The requested amount of memory for the TaskManagers (" + + taskManagerMemoryMb + + "MB) is more than " + + "the largest possible YARN container: " + + freeClusterResources.containerLimit + + noteRsc); + } + if (jobManagerMemoryMb > freeClusterResources.containerLimit) { + LOG.warn("The requested amount of memory for the JobManager (" + + jobManagerMemoryMb + + "MB) is more than " + + "the largest possible YARN container: " + + freeClusterResources.containerLimit + + noteRsc); + } + + return new ClusterSpecification.ClusterSpecificationBuilder() + .setMasterMemoryMB(jobManagerMemoryMb) + .setTaskManagerMemoryMB(taskManagerMemoryMb) + .setSlotsPerTaskManager(clusterSpecification.getSlotsPerTaskManager()) + .createClusterSpecification(); + } + + private void logIfComponentMemNotIntegerMultipleOfYarnMinAllocation( + String componentName, int componentMemoryMB, int yarnMinAllocationMB) { + int normalizedMemMB = + (componentMemoryMB + (yarnMinAllocationMB - 1)) / yarnMinAllocationMB * yarnMinAllocationMB; + if (normalizedMemMB <= 0) { + normalizedMemMB = yarnMinAllocationMB; + } + if (componentMemoryMB != normalizedMemMB) { + LOG.info( + "The configured {} memory is {} MB. YARN will allocate {} MB to make up an integer multiple of its " + + "minimum allocation memory ({} MB, configured via 'yarn.scheduler.minimum-allocation-mb'). The extra {} MB " + + "may not be used by Flink.", + componentName, + componentMemoryMB, + normalizedMemMB, + yarnMinAllocationMB, + normalizedMemMB - componentMemoryMB); + } + } + + private void checkYarnQueues(YarnClient yarnClient) { + try { + List queues = yarnClient.getAllQueues(); + if (queues.size() > 0 + && this.yarnQueue != null) { // check only if there are queues configured in yarn and for + // this session. + boolean queueFound = false; + for (QueueInfo queue : queues) { + if (queue.getQueueName().equals(this.yarnQueue) + || queue.getQueueName().equals("root." + this.yarnQueue)) { + queueFound = true; + break; + } + } + if (!queueFound) { + String queueNames = ""; + for (QueueInfo queue : queues) { + queueNames += queue.getQueueName() + ", "; + } + LOG.warn("The specified queue '" + + this.yarnQueue + + "' does not exist. " + + "Available queues: " + + queueNames); + } + } else { + LOG.debug("The YARN cluster does not have any queues configured"); + } + } catch (Throwable e) { + LOG.warn("Error while getting queue information from YARN: " + e.getMessage()); + if (LOG.isDebugEnabled()) { + LOG.debug("Error details", e); + } + } + } + + private ApplicationReport startAppMaster( + Configuration configuration, + String applicationName, + String yarnClusterEntrypoint, + JobGraph jobGraph, + YarnClient yarnClient, + YarnClientApplication yarnApplication, + ClusterSpecification clusterSpecification) + throws Exception { + + // ------------------ Initialize the file systems ------------------------- + + org.apache.flink.core.fs.FileSystem.initialize( + configuration, PluginUtils.createPluginManagerFromRootFolder(configuration)); + + final FileSystem fs = FileSystem.get(yarnConfiguration); + + // hard coded check for the GoogleHDFS client because its not overriding the getScheme() + // method. 
+ if (!fs.getClass().getSimpleName().equals("GoogleHadoopFileSystem") + && fs.getScheme().startsWith("file")) { + LOG.warn("The file system scheme is '" + + fs.getScheme() + + "'. This indicates that the " + + "specified Hadoop configuration path is wrong and the system is using the default Hadoop configuration values." + + "The Flink YARN client needs to store its files in a distributed file system"); + } + + ApplicationSubmissionContext appContext = yarnApplication.getApplicationSubmissionContext(); + + final List providedLibDirs = Utils.getQualifiedRemoteSharedPaths(configuration, yarnConfiguration); + + final YarnApplicationFileUploader fileUploader = YarnApplicationFileUploader.from( + fs, getStagingDir(fs), providedLibDirs, appContext.getApplicationId(), getFileReplication()); + + // The files need to be shipped and added to classpath. + Set systemShipFiles = new HashSet<>(shipFiles.size()); + for (File file : shipFiles) { + systemShipFiles.add(file.getAbsoluteFile()); + } + + final String logConfigFilePath = configuration.getString(YarnConfigOptionsInternal.APPLICATION_LOG_CONFIG_FILE); + if (logConfigFilePath != null) { + systemShipFiles.add(new File(logConfigFilePath)); + } + + // Set-up ApplicationSubmissionContext for the application + + final ApplicationId appId = appContext.getApplicationId(); + + // ------------------ Add Zookeeper namespace to local flinkConfiguraton ------ + setHAClusterIdIfNotSet(configuration, appId); + + if (HighAvailabilityMode.isHighAvailabilityModeActivated(configuration)) { + // activate re-execution of failed applications + appContext.setMaxAppAttempts(configuration.getInteger( + YarnConfigOptions.APPLICATION_ATTEMPTS.key(), YarnConfiguration.DEFAULT_RM_AM_MAX_ATTEMPTS)); + + activateHighAvailabilitySupport(appContext); + } else { + // set number of application retries to 1 in the default case + appContext.setMaxAppAttempts(configuration.getInteger(YarnConfigOptions.APPLICATION_ATTEMPTS.key(), 1)); + } + + final Set userJarFiles = new HashSet<>(); + if (jobGraph != null) { + userJarFiles.addAll(jobGraph.getUserJars().stream() + .map(f -> f.toUri()) + .map(Path::new) + .collect(Collectors.toSet())); + } + + final List jarUrls = ConfigUtils.decodeListFromConfig(configuration, PipelineOptions.JARS, URI::create); + if (jarUrls != null && YarnApplicationClusterEntryPoint.class.getName().equals(yarnClusterEntrypoint)) { + userJarFiles.addAll(jarUrls.stream().map(Path::new).collect(Collectors.toSet())); + } + + // only for per job mode + if (jobGraph != null) { + for (Map.Entry entry : + jobGraph.getUserArtifacts().entrySet()) { + // only upload local files + if (!Utils.isRemotePath(entry.getValue().filePath)) { + Path localPath = new Path(entry.getValue().filePath); + Tuple2 remoteFileInfo = fileUploader.uploadLocalFileToRemote(localPath, entry.getKey()); + jobGraph.setUserArtifactRemotePath(entry.getKey(), remoteFileInfo.f0.toString()); + } + } + + jobGraph.writeUserArtifactEntriesToConfiguration(); + } + + if (providedLibDirs == null || providedLibDirs.isEmpty()) { + addLibFoldersToShipFiles(systemShipFiles); + } + + // Register all files in provided lib dirs as local resources with public visibility + // and upload the remaining dependencies as local resources with APPLICATION visibility. 
+ final List systemClassPaths = fileUploader.registerProvidedLocalResources(); + final List uploadedDependencies = fileUploader.registerMultipleLocalResources( + systemShipFiles.stream().map(e -> new Path(e.toURI())).collect(Collectors.toSet()), + Path.CUR_DIR, + LocalResourceType.FILE); + systemClassPaths.addAll(uploadedDependencies); + + // upload and register ship-only files + // Plugin files only need to be shipped and should not be added to classpath. + if (providedLibDirs == null || providedLibDirs.isEmpty()) { + Set shipOnlyFiles = new HashSet<>(); + addPluginsFoldersToShipFiles(shipOnlyFiles); + fileUploader.registerMultipleLocalResources( + shipOnlyFiles.stream().map(e -> new Path(e.toURI())).collect(Collectors.toSet()), + Path.CUR_DIR, + LocalResourceType.FILE); + } + + if (!shipArchives.isEmpty()) { + fileUploader.registerMultipleLocalResources( + shipArchives.stream().map(e -> new Path(e.toURI())).collect(Collectors.toSet()), + Path.CUR_DIR, + LocalResourceType.ARCHIVE); + } + + // Upload and register user jars + final List userClassPaths = fileUploader.registerMultipleLocalResources( + userJarFiles, + userJarInclusion == YarnConfigOptions.UserJarInclusion.DISABLED + ? ConfigConstants.DEFAULT_FLINK_USR_LIB_DIR + : Path.CUR_DIR, + LocalResourceType.FILE); + + if (userJarInclusion == YarnConfigOptions.UserJarInclusion.ORDER) { + systemClassPaths.addAll(userClassPaths); + } + + // normalize classpath by sorting + Collections.sort(systemClassPaths); + Collections.sort(userClassPaths); + + // classpath assembler + StringBuilder classPathBuilder = new StringBuilder(); + if (userJarInclusion == YarnConfigOptions.UserJarInclusion.FIRST) { + for (String userClassPath : userClassPaths) { + classPathBuilder.append(userClassPath).append(pathSeparator); + } + } + for (String classPath : systemClassPaths) { + classPathBuilder.append(classPath).append(pathSeparator); + } + + // Setup jar for ApplicationMaster + final YarnLocalResourceDescriptor localResourceDescFlinkJar = fileUploader.uploadFlinkDist(flinkJarPath); + classPathBuilder.append(localResourceDescFlinkJar.getResourceKey()).append(pathSeparator); + + // write job graph to tmp file and add it to local resource + // TODO: server use user main method to generate job graph + if (jobGraph != null) { + File tmpJobGraphFile = null; + try { + tmpJobGraphFile = File.createTempFile(appId.toString(), null); + try (FileOutputStream output = new FileOutputStream(tmpJobGraphFile); + ObjectOutputStream obOutput = new ObjectOutputStream(output)) { + obOutput.writeObject(jobGraph); + } + + final String jobGraphFilename = "job.graph"; + configuration.setString(JOB_GRAPH_FILE_PATH, jobGraphFilename); + + fileUploader.registerSingleLocalResource( + jobGraphFilename, new Path(tmpJobGraphFile.toURI()), "", LocalResourceType.FILE, true, false); + classPathBuilder.append(jobGraphFilename).append(pathSeparator); + } catch (Exception e) { + LOG.warn("Add job graph to local resource fail."); + throw e; + } finally { + if (tmpJobGraphFile != null && !tmpJobGraphFile.delete()) { + LOG.warn("Fail to delete temporary file {}.", tmpJobGraphFile.toPath()); + } + } + } + + // Upload the flink configuration + // write out configuration file + File tmpConfigurationFile = null; + try { + tmpConfigurationFile = File.createTempFile(appId + "-flink-conf.yaml", null); + BootstrapTools.writeConfiguration(configuration, tmpConfigurationFile); + + String flinkConfigKey = "flink-conf.yaml"; + fileUploader.registerSingleLocalResource( + flinkConfigKey, + new 
Path(tmpConfigurationFile.getAbsolutePath()), + "", + LocalResourceType.FILE, + true, + true); + classPathBuilder.append("flink-conf.yaml").append(pathSeparator); + } finally { + if (tmpConfigurationFile != null && !tmpConfigurationFile.delete()) { + LOG.warn("Fail to delete temporary file {}.", tmpConfigurationFile.toPath()); + } + } + + if (userJarInclusion == YarnConfigOptions.UserJarInclusion.LAST) { + for (String userClassPath : userClassPaths) { + classPathBuilder.append(userClassPath).append(pathSeparator); + } + } + + // To support Yarn Secure Integration Test Scenario + // In Integration test setup, the Yarn containers created by YarnMiniCluster does not have + // the Yarn site XML + // and KRB5 configuration files. We are adding these files as container local resources for + // the container + // applications (JM/TMs) to have proper secure cluster setup + Path remoteYarnSiteXmlPath = null; + if (System.getenv("IN_TESTS") != null) { + File f = new File(System.getenv("YARN_CONF_DIR"), Utils.YARN_SITE_FILE_NAME); + LOG.info("Adding Yarn configuration {} to the AM container local resource bucket", f.getAbsolutePath()); + Path yarnSitePath = new Path(f.getAbsolutePath()); + remoteYarnSiteXmlPath = fileUploader + .registerSingleLocalResource( + Utils.YARN_SITE_FILE_NAME, yarnSitePath, "", LocalResourceType.FILE, false, false) + .getPath(); + if (System.getProperty("java.security.krb5.conf") != null) { + configuration.set(SecurityOptions.KERBEROS_KRB5_PATH, System.getProperty("java.security.krb5.conf")); + } + } + + Path remoteKrb5Path = null; + boolean hasKrb5 = false; + String krb5Config = configuration.get(SecurityOptions.KERBEROS_KRB5_PATH); + if (!StringUtils.isNullOrWhitespaceOnly(krb5Config)) { + final File krb5 = new File(krb5Config); + LOG.info("Adding KRB5 configuration {} to the AM container local resource bucket", krb5.getAbsolutePath()); + final Path krb5ConfPath = new Path(krb5.getAbsolutePath()); + remoteKrb5Path = fileUploader + .registerSingleLocalResource( + Utils.KRB5_FILE_NAME, krb5ConfPath, "", LocalResourceType.FILE, false, false) + .getPath(); + hasKrb5 = true; + } + + Path remotePathKeytab = null; + String localizedKeytabPath = null; + String keytab = configuration.getString(SecurityOptions.KERBEROS_LOGIN_KEYTAB); + if (keytab != null) { + boolean localizeKeytab = flinkConfiguration.getBoolean(YarnConfigOptions.SHIP_LOCAL_KEYTAB); + localizedKeytabPath = flinkConfiguration.getString(YarnConfigOptions.LOCALIZED_KEYTAB_PATH); + if (localizeKeytab) { + // Localize the keytab to YARN containers via local resource. + LOG.info("Adding keytab {} to the AM container local resource bucket", keytab); + remotePathKeytab = fileUploader + .registerSingleLocalResource( + localizedKeytabPath, new Path(keytab), "", LocalResourceType.FILE, false, false) + .getPath(); + } else { + // // Assume Keytab is pre-installed in the container. 
+ localizedKeytabPath = flinkConfiguration.getString(YarnConfigOptions.LOCALIZED_KEYTAB_PATH); + } + } + + final JobManagerProcessSpec processSpec = + JobManagerProcessUtils.processSpecFromConfigWithNewOptionToInterpretLegacyHeap( + flinkConfiguration, JobManagerOptions.TOTAL_PROCESS_MEMORY); + final ContainerLaunchContext amContainer = + setupApplicationMasterContainer(yarnClusterEntrypoint, hasKrb5, processSpec); + + // setup security tokens + if (UserGroupInformation.isSecurityEnabled()) { + // set HDFS delegation tokens when security is enabled + LOG.info("Adding delegation token to the AM container."); + final List pathsToObtainToken = new ArrayList<>(); + boolean fetchToken = configuration.getBoolean(SecurityOptions.KERBEROS_FETCH_DELEGATION_TOKEN); + if (fetchToken) { + List yarnAccessList = + ConfigUtils.decodeListFromConfig(configuration, YarnConfigOptions.YARN_ACCESS, Path::new); + pathsToObtainToken.addAll(yarnAccessList); + pathsToObtainToken.addAll(fileUploader.getRemotePaths()); + } + Utils.setTokensFor(amContainer, pathsToObtainToken, yarnConfiguration, fetchToken); + } + + amContainer.setLocalResources(fileUploader.getRegisteredLocalResources()); + fileUploader.close(); + + // Setup CLASSPATH and environment variables for ApplicationMaster + final Map appMasterEnv = new HashMap<>(); + // set user specified app master environment variables + appMasterEnv.putAll(ConfigurationUtils.getPrefixedKeyValuePairs( + ResourceManagerOptions.CONTAINERIZED_MASTER_ENV_PREFIX, configuration)); + // set Flink app class path + appMasterEnv.put(YarnConfigKeys.ENV_FLINK_CLASSPATH, classPathBuilder.toString()); + + // set Flink on YARN internal configuration values + appMasterEnv.put(YarnConfigKeys.FLINK_DIST_JAR, localResourceDescFlinkJar.toString()); + appMasterEnv.put(YarnConfigKeys.ENV_APP_ID, appId.toString()); + appMasterEnv.put( + YarnConfigKeys.ENV_CLIENT_HOME_DIR, fileUploader.getHomeDir().toString()); + appMasterEnv.put( + YarnConfigKeys.ENV_CLIENT_SHIP_FILES, + encodeYarnLocalResourceDescriptorListToString(fileUploader.getEnvShipResourceList())); + appMasterEnv.put( + YarnConfigKeys.FLINK_YARN_FILES, + fileUploader.getApplicationDir().toUri().toString()); + + // https://github.com/apache/hadoop/blob/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/YarnApplicationSecurity.md#identity-on-an-insecure-cluster-hadoop_user_name + appMasterEnv.put( + YarnConfigKeys.ENV_HADOOP_USER_NAME, + UserGroupInformation.getCurrentUser().getUserName()); + + if (localizedKeytabPath != null) { + appMasterEnv.put(YarnConfigKeys.LOCAL_KEYTAB_PATH, localizedKeytabPath); + String principal = configuration.getString(SecurityOptions.KERBEROS_LOGIN_PRINCIPAL); + appMasterEnv.put(YarnConfigKeys.KEYTAB_PRINCIPAL, principal); + if (remotePathKeytab != null) { + appMasterEnv.put(YarnConfigKeys.REMOTE_KEYTAB_PATH, remotePathKeytab.toString()); + } + } + + // To support Yarn Secure Integration Test Scenario + if (remoteYarnSiteXmlPath != null) { + appMasterEnv.put(YarnConfigKeys.ENV_YARN_SITE_XML_PATH, remoteYarnSiteXmlPath.toString()); + } + if (remoteKrb5Path != null) { + appMasterEnv.put(YarnConfigKeys.ENV_KRB5_PATH, remoteKrb5Path.toString()); + } + + // set classpath from YARN configuration + Utils.setupYarnClassPath(yarnConfiguration, appMasterEnv); + + amContainer.setEnvironment(appMasterEnv); + + // Set up resource type requirements for ApplicationMaster + Resource capability = Records.newRecord(Resource.class); + capability.setMemory(clusterSpecification.getMasterMemoryMB()); + 
capability.setVirtualCores(flinkConfiguration.getInteger(YarnConfigOptions.APP_MASTER_VCORES)); + + final String customApplicationName = customName != null ? customName : applicationName; + + appContext.setApplicationName(customApplicationName); + appContext.setApplicationType(applicationType != null ? applicationType : "Apache Flink"); + appContext.setAMContainerSpec(amContainer); + appContext.setResource(capability); + + // Set priority for application + int priorityNum = flinkConfiguration.getInteger(YarnConfigOptions.APPLICATION_PRIORITY); + if (priorityNum >= 0) { + Priority priority = Priority.newInstance(priorityNum); + appContext.setPriority(priority); + } + + if (yarnQueue != null) { + appContext.setQueue(yarnQueue); + } + + setApplicationNodeLabel(appContext); + + setApplicationTags(appContext); + + // add a hook to clean up in case deployment fails + Thread deploymentFailureHook = new DeploymentFailureHook(yarnApplication, fileUploader.getApplicationDir()); + Runtime.getRuntime().addShutdownHook(deploymentFailureHook); + LOG.info("Submitting application master " + appId); + yarnClient.submitApplication(appContext); + + LOG.info("Waiting for the cluster to be allocated"); + final long startTime = System.currentTimeMillis(); + ApplicationReport report; + YarnApplicationState lastAppState = YarnApplicationState.NEW; + loop: + while (true) { + try { + report = yarnClient.getApplicationReport(appId); + } catch (IOException e) { + throw new YarnDeploymentException("Failed to deploy the cluster.", e); + } + YarnApplicationState appState = report.getYarnApplicationState(); + LOG.debug("Application State: {}", appState); + switch (appState) { + case FAILED: + case KILLED: + throw new YarnDeploymentException("The YARN application unexpectedly switched to state " + + appState + + " during deployment. \n" + + "Diagnostics from YARN: " + + report.getDiagnostics() + + "\n" + + "If log aggregation is enabled on your cluster, use this command to further investigate the issue:\n" + + "yarn logs -applicationId " + + appId); + // break .. + case RUNNING: + LOG.info("YARN application has been deployed successfully."); + break loop; + case FINISHED: + LOG.info("YARN application has been finished successfully."); + break loop; + default: + if (appState != lastAppState) { + LOG.info("Deploying cluster, current state " + appState); + } + if (System.currentTimeMillis() - startTime > 60000) { + LOG.info( + "Deployment took more than 60 seconds. Please check if the requested resources are available in the YARN cluster"); + } + } + lastAppState = appState; + Thread.sleep(250); + } + + // since deployment was successful, remove the hook + ShutdownHookUtil.removeShutdownHook(deploymentFailureHook, getClass().getSimpleName(), LOG); + return report; + } + + /** + * Returns the configured remote target home directory if set, otherwise returns the default + * home directory. + * + * @param fileSystem file system used + * @return the remote target home directory + */ + private Path getStagingDir(FileSystem fileSystem) { + final String configuredStagingDir = flinkConfiguration.getString(YarnConfigOptions.STAGING_DIRECTORY); + return configuredStagingDir != null + ? 
fileSystem.makeQualified(new Path(configuredStagingDir)) + : fileSystem.getHomeDirectory(); + } + + private int getFileReplication() { + final int yarnFileReplication = + yarnConfiguration.getInt(DFSConfigKeys.DFS_REPLICATION_KEY, DFSConfigKeys.DFS_REPLICATION_DEFAULT); + final int fileReplication = flinkConfiguration.getInteger(YarnConfigOptions.FILE_REPLICATION); + return fileReplication > 0 ? fileReplication : yarnFileReplication; + } + + private static String encodeYarnLocalResourceDescriptorListToString(List resources) { + return String.join( + LOCAL_RESOURCE_DESCRIPTOR_SEPARATOR, + resources.stream().map(YarnLocalResourceDescriptor::toString).collect(Collectors.toList())); + } + + /** + * Kills YARN application and stops YARN client. + * + *
<p>
Use this method to kill the App before it has been properly deployed + */ + private void failSessionDuringDeployment(YarnClient yarnClient, YarnClientApplication yarnApplication) { + LOG.info("Killing YARN application"); + + try { + yarnClient.killApplication( + yarnApplication.getNewApplicationResponse().getApplicationId()); + } catch (Exception e) { + // we only log a debug message here because the "killApplication" call is a best-effort + // call (we don't know if the application has been deployed when the error occurred). + LOG.debug("Error while killing YARN application", e); + } + } + + private static class ClusterResourceDescription { + public final int totalFreeMemory; + public final int containerLimit; + public final int[] nodeManagersFree; + + public ClusterResourceDescription(int totalFreeMemory, int containerLimit, int[] nodeManagersFree) { + this.totalFreeMemory = totalFreeMemory; + this.containerLimit = containerLimit; + this.nodeManagersFree = nodeManagersFree; + } + } + + private ClusterResourceDescription getCurrentFreeClusterResources(YarnClient yarnClient) + throws YarnException, IOException { + List nodes = yarnClient.getNodeReports(NodeState.RUNNING); + + int totalFreeMemory = 0; + int containerLimit = 0; + int[] nodeManagersFree = new int[nodes.size()]; + + for (int i = 0; i < nodes.size(); i++) { + NodeReport rep = nodes.get(i); + int free = rep.getCapability().getMemory() + - (rep.getUsed() != null ? rep.getUsed().getMemory() : 0); + nodeManagersFree[i] = free; + totalFreeMemory += free; + if (free > containerLimit) { + containerLimit = free; + } + } + return new ClusterResourceDescription(totalFreeMemory, containerLimit, nodeManagersFree); + } + + @Override + public String getClusterDescription() { + + try { + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + PrintStream ps = new PrintStream(baos); + + YarnClusterMetrics metrics = yarnClient.getYarnClusterMetrics(); + + ps.append("NodeManagers in the ClusterClient " + metrics.getNumNodeManagers()); + List nodes = yarnClient.getNodeReports(NodeState.RUNNING); + final String format = "|%-16s |%-16s %n"; + ps.printf("|Property |Value %n"); + ps.println("+---------------------------------------+"); + int totalMemory = 0; + int totalCores = 0; + for (NodeReport rep : nodes) { + final Resource res = rep.getCapability(); + totalMemory += res.getMemory(); + totalCores += res.getVirtualCores(); + ps.format(format, "NodeID", rep.getNodeId()); + ps.format(format, "Memory", res.getMemory() + " MB"); + ps.format(format, "vCores", res.getVirtualCores()); + ps.format(format, "HealthReport", rep.getHealthReport()); + ps.format(format, "Containers", rep.getNumContainers()); + ps.println("+---------------------------------------+"); + } + ps.println("Summary: totalMemory " + totalMemory + " totalCores " + totalCores); + List qInfo = yarnClient.getAllQueues(); + for (QueueInfo q : qInfo) { + ps.println("Queue: " + + q.getQueueName() + + ", Current Capacity: " + + q.getCurrentCapacity() + + " Max Capacity: " + + q.getMaximumCapacity() + + " Applications: " + + q.getApplications().size()); + } + return baos.toString(); + } catch (Exception e) { + throw new RuntimeException("Couldn't get cluster description", e); + } + } + + private void activateHighAvailabilitySupport(ApplicationSubmissionContext appContext) + throws InvocationTargetException, IllegalAccessException { + + ApplicationSubmissionContextReflector reflector = ApplicationSubmissionContextReflector.getInstance(); + + 
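The reflector obtained above guards against Hadoop versions that lack some ApplicationSubmissionContext setters; as the class further below shows, each method is looked up once and a missing method is treated as a no-op. A condensed sketch of that optional-method lookup, using only java.lang.reflect (hypothetical helper, not part of this patch):

    import java.lang.reflect.Method;

    final class OptionalMethodSketch {
        // Returns the Method if the running class exposes it, or null when the
        // Hadoop version in use does not support the call.
        static Method findOrNull(Class<?> clazz, String name, Class<?>... parameterTypes) {
            try {
                return clazz.getMethod(name, parameterTypes);
            } catch (NoSuchMethodException e) {
                return null;
            }
        }

        public static void main(String[] args) throws Exception {
            Method trim = findOrNull(String.class, "trim");
            // Invokes the method only when it exists; otherwise callers simply skip it.
            System.out.println(trim != null ? trim.invoke(" yarn ") : "unsupported");
        }
    }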
reflector.setKeepContainersAcrossApplicationAttempts(appContext, true); + + reflector.setAttemptFailuresValidityInterval( + appContext, + flinkConfiguration.getLong(YarnConfigOptions.APPLICATION_ATTEMPT_FAILURE_VALIDITY_INTERVAL)); + } + + private void setApplicationTags(final ApplicationSubmissionContext appContext) + throws InvocationTargetException, IllegalAccessException { + + final ApplicationSubmissionContextReflector reflector = ApplicationSubmissionContextReflector.getInstance(); + final String tagsString = flinkConfiguration.getString(YarnConfigOptions.APPLICATION_TAGS); + + final Set applicationTags = new HashSet<>(); + + // Trim whitespace and cull empty tags + for (final String tag : tagsString.split(",")) { + final String trimmedTag = tag.trim(); + if (!trimmedTag.isEmpty()) { + applicationTags.add(trimmedTag); + } + } + + reflector.setApplicationTags(appContext, applicationTags); + } + + private void setApplicationNodeLabel(final ApplicationSubmissionContext appContext) + throws InvocationTargetException, IllegalAccessException { + + if (nodeLabel != null) { + final ApplicationSubmissionContextReflector reflector = ApplicationSubmissionContextReflector.getInstance(); + reflector.setApplicationNodeLabel(appContext, nodeLabel); + } + } + + /** + * Singleton object which uses reflection to determine whether the {@link + * ApplicationSubmissionContext} supports various methods which, depending on the Hadoop + * version, may or may not be supported. + * + *
<p>
If an unsupported method is invoked, nothing happens. + * + *
<p>
Currently three methods are proxied: - setApplicationTags (>= 2.4.0) - + * setAttemptFailuresValidityInterval (>= 2.6.0) - setKeepContainersAcrossApplicationAttempts + * (>= 2.4.0) - setNodeLabelExpression (>= 2.6.0) + */ + private static class ApplicationSubmissionContextReflector { + private static final Logger LOG = LoggerFactory.getLogger(ApplicationSubmissionContextReflector.class); + + private static final ApplicationSubmissionContextReflector instance = + new ApplicationSubmissionContextReflector(ApplicationSubmissionContext.class); + + public static ApplicationSubmissionContextReflector getInstance() { + return instance; + } + + private static final String APPLICATION_TAGS_METHOD_NAME = "setApplicationTags"; + private static final String ATTEMPT_FAILURES_METHOD_NAME = "setAttemptFailuresValidityInterval"; + private static final String KEEP_CONTAINERS_METHOD_NAME = "setKeepContainersAcrossApplicationAttempts"; + private static final String NODE_LABEL_EXPRESSION_NAME = "setNodeLabelExpression"; + + private final Method applicationTagsMethod; + private final Method attemptFailuresValidityIntervalMethod; + private final Method keepContainersMethod; + + @Nullable + private final Method nodeLabelExpressionMethod; + + private ApplicationSubmissionContextReflector(Class clazz) { + Method applicationTagsMethod; + Method attemptFailuresValidityIntervalMethod; + Method keepContainersMethod; + Method nodeLabelExpressionMethod; + + try { + // this method is only supported by Hadoop 2.4.0 onwards + applicationTagsMethod = clazz.getMethod(APPLICATION_TAGS_METHOD_NAME, Set.class); + LOG.debug("{} supports method {}.", clazz.getCanonicalName(), APPLICATION_TAGS_METHOD_NAME); + } catch (NoSuchMethodException e) { + LOG.debug("{} does not support method {}.", clazz.getCanonicalName(), APPLICATION_TAGS_METHOD_NAME); + // assign null because the Hadoop version apparently does not support this call. + applicationTagsMethod = null; + } + + this.applicationTagsMethod = applicationTagsMethod; + + try { + // this method is only supported by Hadoop 2.6.0 onwards + attemptFailuresValidityIntervalMethod = clazz.getMethod(ATTEMPT_FAILURES_METHOD_NAME, long.class); + LOG.debug("{} supports method {}.", clazz.getCanonicalName(), ATTEMPT_FAILURES_METHOD_NAME); + } catch (NoSuchMethodException e) { + LOG.debug("{} does not support method {}.", clazz.getCanonicalName(), ATTEMPT_FAILURES_METHOD_NAME); + // assign null because the Hadoop version apparently does not support this call. + attemptFailuresValidityIntervalMethod = null; + } + + this.attemptFailuresValidityIntervalMethod = attemptFailuresValidityIntervalMethod; + + try { + // this method is only supported by Hadoop 2.4.0 onwards + keepContainersMethod = clazz.getMethod(KEEP_CONTAINERS_METHOD_NAME, boolean.class); + LOG.debug("{} supports method {}.", clazz.getCanonicalName(), KEEP_CONTAINERS_METHOD_NAME); + } catch (NoSuchMethodException e) { + LOG.debug("{} does not support method {}.", clazz.getCanonicalName(), KEEP_CONTAINERS_METHOD_NAME); + // assign null because the Hadoop version apparently does not support this call. 
+ keepContainersMethod = null; + } + + this.keepContainersMethod = keepContainersMethod; + + try { + nodeLabelExpressionMethod = clazz.getMethod(NODE_LABEL_EXPRESSION_NAME, String.class); + LOG.debug("{} supports method {}.", clazz.getCanonicalName(), NODE_LABEL_EXPRESSION_NAME); + } catch (NoSuchMethodException e) { + LOG.debug("{} does not support method {}.", clazz.getCanonicalName(), NODE_LABEL_EXPRESSION_NAME); + nodeLabelExpressionMethod = null; + } + + this.nodeLabelExpressionMethod = nodeLabelExpressionMethod; + } + + public void setApplicationTags(ApplicationSubmissionContext appContext, Set applicationTags) + throws InvocationTargetException, IllegalAccessException { + if (applicationTagsMethod != null) { + LOG.debug( + "Calling method {} of {}.", + applicationTagsMethod.getName(), + appContext.getClass().getCanonicalName()); + applicationTagsMethod.invoke(appContext, applicationTags); + } else { + LOG.debug( + "{} does not support method {}. Doing nothing.", + appContext.getClass().getCanonicalName(), + APPLICATION_TAGS_METHOD_NAME); + } + } + + public void setApplicationNodeLabel(ApplicationSubmissionContext appContext, String nodeLabel) + throws InvocationTargetException, IllegalAccessException { + if (nodeLabelExpressionMethod != null) { + LOG.debug( + "Calling method {} of {}.", + nodeLabelExpressionMethod.getName(), + appContext.getClass().getCanonicalName()); + nodeLabelExpressionMethod.invoke(appContext, nodeLabel); + } else { + LOG.debug( + "{} does not support method {}. Doing nothing.", + appContext.getClass().getCanonicalName(), + NODE_LABEL_EXPRESSION_NAME); + } + } + + public void setAttemptFailuresValidityInterval(ApplicationSubmissionContext appContext, long validityInterval) + throws InvocationTargetException, IllegalAccessException { + if (attemptFailuresValidityIntervalMethod != null) { + LOG.debug( + "Calling method {} of {}.", + attemptFailuresValidityIntervalMethod.getName(), + appContext.getClass().getCanonicalName()); + attemptFailuresValidityIntervalMethod.invoke(appContext, validityInterval); + } else { + LOG.debug( + "{} does not support method {}. Doing nothing.", + appContext.getClass().getCanonicalName(), + ATTEMPT_FAILURES_METHOD_NAME); + } + } + + public void setKeepContainersAcrossApplicationAttempts( + ApplicationSubmissionContext appContext, boolean keepContainers) + throws InvocationTargetException, IllegalAccessException { + + if (keepContainersMethod != null) { + LOG.debug( + "Calling method {} of {}.", + keepContainersMethod.getName(), + appContext.getClass().getCanonicalName()); + keepContainersMethod.invoke(appContext, keepContainers); + } else { + LOG.debug( + "{} does not support method {}. 
Doing nothing.", + appContext.getClass().getCanonicalName(), + KEEP_CONTAINERS_METHOD_NAME); + } + } + } + + private static class YarnDeploymentException extends RuntimeException { + private static final long serialVersionUID = -812040641215388943L; + + public YarnDeploymentException(String message) { + super(message); + } + + public YarnDeploymentException(String message, Throwable cause) { + super(message, cause); + } + } + + private class DeploymentFailureHook extends Thread { + + private final YarnClient yarnClient; + private final YarnClientApplication yarnApplication; + private final Path yarnFilesDir; + + DeploymentFailureHook(YarnClientApplication yarnApplication, Path yarnFilesDir) { + this.yarnApplication = Preconditions.checkNotNull(yarnApplication); + this.yarnFilesDir = Preconditions.checkNotNull(yarnFilesDir); + + // A new yarn client need to be created in shutdown hook in order to avoid + // the yarn client has been closed by YarnClusterDescriptor. + this.yarnClient = YarnClient.createYarnClient(); + this.yarnClient.init(yarnConfiguration); + } + + @Override + public void run() { + LOG.info("Cancelling deployment from Deployment Failure Hook"); + yarnClient.start(); + failSessionDuringDeployment(yarnClient, yarnApplication); + yarnClient.stop(); + LOG.info("Deleting files in {}.", yarnFilesDir); + try { + FileSystem fs = FileSystem.get(yarnConfiguration); + + if (!fs.delete(yarnFilesDir, true)) { + throw new IOException("Deleting files in " + yarnFilesDir + " was unsuccessful"); + } + + fs.close(); + } catch (IOException e) { + LOG.error("Failed to delete Flink Jar and configuration files in HDFS", e); + } + } + } + + @VisibleForTesting + void addLibFoldersToShipFiles(Collection effectiveShipFiles) { + // Add lib folder to the ship files if the environment variable is set. + // This is for convenience when running from the command-line. + // (for other files users explicitly set the ship files) + String libDir = System.getenv().get(ENV_FLINK_LIB_DIR); + if (libDir != null) { + File directoryFile = new File(libDir); + if (directoryFile.isDirectory()) { + effectiveShipFiles.add(directoryFile); + } else { + throw new YarnDeploymentException("The environment variable '" + + ENV_FLINK_LIB_DIR + + "' is set to '" + + libDir + + "' but the directory doesn't exist."); + } + } else if (shipFiles.isEmpty()) { + LOG.warn( + "Environment variable '{}' not set and ship files have not been provided manually. 
" + + "Not shipping any library files.", + ENV_FLINK_LIB_DIR); + } + } + + @VisibleForTesting + void addPluginsFoldersToShipFiles(Collection effectiveShipFiles) { + final Optional pluginsDir = PluginConfig.getPluginsDir(); + pluginsDir.ifPresent(effectiveShipFiles::add); + } + + ContainerLaunchContext setupApplicationMasterContainer( + String yarnClusterEntrypoint, boolean hasKrb5, JobManagerProcessSpec processSpec) { + // ------------------ Prepare Application Master Container ------------------------------ + + // respect custom JVM options in the YAML file + String javaOpts = flinkConfiguration.getString(CoreOptions.FLINK_JVM_OPTIONS); + if (flinkConfiguration.getString(CoreOptions.FLINK_JM_JVM_OPTIONS).length() > 0) { + javaOpts += " " + flinkConfiguration.getString(CoreOptions.FLINK_JM_JVM_OPTIONS); + } + + // krb5.conf file will be available as local resource in JM/TM container + if (hasKrb5) { + javaOpts += " -Djava.security.krb5.conf=krb5.conf"; + } + + // Set up the container launch context for the application master + ContainerLaunchContext amContainer = Records.newRecord(ContainerLaunchContext.class); + + final Map startCommandValues = new HashMap<>(); + startCommandValues.put("java", "$JAVA_HOME/bin/java"); + + String jvmHeapMem = JobManagerProcessUtils.generateJvmParametersStr(processSpec, flinkConfiguration); + startCommandValues.put("jvmmem", jvmHeapMem); + + startCommandValues.put("jvmopts", javaOpts); + startCommandValues.put("logging", YarnLogConfigUtil.getLoggingYarnCommand(flinkConfiguration)); + + startCommandValues.put("class", yarnClusterEntrypoint); + startCommandValues.put( + "redirects", + "1> " + + ApplicationConstants.LOG_DIR_EXPANSION_VAR + + "/jobmanager.out " + + "2> " + + ApplicationConstants.LOG_DIR_EXPANSION_VAR + + "/jobmanager.err"); + String dynamicParameterListStr = JobManagerProcessUtils.generateDynamicConfigsStr(processSpec); + startCommandValues.put("args", dynamicParameterListStr); + + final String commandTemplate = flinkConfiguration.getString( + ConfigConstants.YARN_CONTAINER_START_COMMAND_TEMPLATE, + ConfigConstants.DEFAULT_YARN_CONTAINER_START_COMMAND_TEMPLATE); + final String amCommand = BootstrapTools.getStartCommand(commandTemplate, startCommandValues); + + amContainer.setCommands(Collections.singletonList(amCommand)); + + LOG.debug("Application Master start command: " + amCommand); + + return amContainer; + } + + private static YarnConfigOptions.UserJarInclusion getUserJarInclusionMode( + org.apache.flink.configuration.Configuration config) { + return config.get(YarnConfigOptions.CLASSPATH_INCLUDE_USER_JAR); + } + + private static boolean isUsrLibDirIncludedInShipFiles(List shipFiles) { + return shipFiles.stream() + .filter(File::isDirectory) + .map(File::getName) + .noneMatch(name -> name.equals(DEFAULT_FLINK_USR_LIB_DIR)); + } + + private void setClusterEntrypointInfoToConfig(final ApplicationReport report) { + checkNotNull(report); + + final ApplicationId appId = report.getApplicationId(); + final String host = report.getHost(); + final int port = report.getRpcPort(); + + LOG.info("Found Web Interface {}:{} of application '{}'.", host, port, appId); + + flinkConfiguration.setString(JobManagerOptions.ADDRESS, host); + flinkConfiguration.setInteger(JobManagerOptions.PORT, port); + + flinkConfiguration.setString(RestOptions.ADDRESS, host); + flinkConfiguration.setInteger(RestOptions.PORT, port); + + flinkConfiguration.set(YarnConfigOptions.APPLICATION_ID, ConverterUtils.toString(appId)); + + setHAClusterIdIfNotSet(flinkConfiguration, appId); 
+ } + + private void setHAClusterIdIfNotSet(Configuration configuration, ApplicationId appId) { + // set cluster-id to app id if not specified + if (!configuration.contains(HighAvailabilityOptions.HA_CLUSTER_ID)) { + configuration.set(HighAvailabilityOptions.HA_CLUSTER_ID, ConverterUtils.toString(appId)); + } + } + + public static void logDetachedClusterInformation(ApplicationId yarnApplicationId, Logger logger) { + logger.info( + "The Flink YARN session cluster has been started in detached mode. In order to " + + "stop Flink gracefully, use the following command:\n" + + "$ echo \"stop\" | ./bin/yarn-session.sh -id {}\n" + + "If this should not be possible, then you can also kill Flink via YARN's web interface or via:\n" + + "$ yarn application -kill {}\n" + + "Note that killing Flink might not clean up all job artifacts and temporary files.", + yarnApplicationId, + yarnApplicationId); + } +} diff --git a/dinky-client/dinky-client-1.14/src/main/java/org/dinky/executor/CustomTableEnvironmentImpl.java b/dinky-client/dinky-client-1.14/src/main/java/org/dinky/executor/CustomTableEnvironmentImpl.java index c83347c9c8..e64efaa8f7 100644 --- a/dinky-client/dinky-client-1.14/src/main/java/org/dinky/executor/CustomTableEnvironmentImpl.java +++ b/dinky-client/dinky-client-1.14/src/main/java/org/dinky/executor/CustomTableEnvironmentImpl.java @@ -22,6 +22,7 @@ import org.dinky.assertion.Asserts; import org.dinky.data.model.LineageRel; import org.dinky.data.result.SqlExplainResult; +import org.dinky.utils.JsonUtils; import org.dinky.utils.LineageContext; import org.apache.flink.api.common.RuntimeExecutionMode; @@ -73,6 +74,7 @@ import java.io.File; import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Method; +import java.net.URL; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; @@ -81,18 +83,19 @@ import java.util.Optional; import java.util.stream.Collectors; -import com.fasterxml.jackson.core.JsonProcessingException; -import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.node.ObjectNode; import cn.hutool.core.collection.CollUtil; import cn.hutool.core.util.ReflectUtil; +import cn.hutool.core.util.URLUtil; +import lombok.extern.slf4j.Slf4j; /** * 定制TableEnvironmentImpl * * @since 2021/10/22 10:02 */ +@Slf4j public class CustomTableEnvironmentImpl extends AbstractCustomTableEnvironment { public CustomTableEnvironmentImpl( @@ -212,21 +215,26 @@ public ObjectNode getStreamGraph(String statement) { ((DefaultExecutor) executor).getExecutionEnvironment().generateStreamGraph(trans); JSONGenerator jsonGenerator = new JSONGenerator(streamGraph); String json = jsonGenerator.getJSON(); - ObjectMapper mapper = new ObjectMapper(); - ObjectNode objectNode = mapper.createObjectNode(); - try { - objectNode = (ObjectNode) mapper.readTree(json); - } catch (JsonProcessingException e) { - e.printStackTrace(); - } finally { - return objectNode; - } + return JsonUtils.parseObject(json); } else { throw new TableException("Unsupported SQL query! explainSql() need a single SQL to query."); } } } + @Override + public void addJar(File... 
jarPath) { + Configuration configuration = new Configuration(this.getRootConfiguration()); + List pathList = + Arrays.stream(URLUtil.getURLs(jarPath)).map(URL::toString).collect(Collectors.toList()); + List jars = configuration.get(PipelineOptions.JARS); + if (jars == null) { + configuration.set(PipelineOptions.JARS, pathList); + } else { + CollUtil.addAll(jars, pathList); + } + } + @Override public JobPlanInfo getJobPlanInfo(List statements) { return new JobPlanInfo(JsonPlanGenerator.generatePlan(getJobGraphFromInserts(statements))); @@ -374,19 +382,6 @@ public void executeCTAS(Operation operation) { } } - @Override - public void addJar(File... jarPath) { - Configuration configuration = this.getRootConfiguration(); - List jars = configuration.get(PipelineOptions.JARS); - if (jars == null) { - configuration.set( - PipelineOptions.JARS, - Arrays.stream(jarPath).map(File::getAbsolutePath).collect(Collectors.toList())); - } else { - CollUtil.addAll(jars, jarPath); - } - } - @Override public void createTemporaryView(String path, DataStream dataStream, Expression... fields) { createTemporaryView(path, fromDataStream(dataStream, fields)); diff --git a/dinky-client/dinky-client-1.15/pom.xml b/dinky-client/dinky-client-1.15/pom.xml index 003f87ff90..dd0d6d3306 100644 --- a/dinky-client/dinky-client-1.15/pom.xml +++ b/dinky-client/dinky-client-1.15/pom.xml @@ -36,10 +36,6 @@ dinky-client-base ${project.version} - - org.dinky - dinky-common - org.dinky dinky-flink-1.15 diff --git a/dinky-client/dinky-client-1.15/src/main/java/org/apache/flink/yarn/Utils.java b/dinky-client/dinky-client-1.15/src/main/java/org/apache/flink/yarn/Utils.java new file mode 100644 index 0000000000..0598def897 --- /dev/null +++ b/dinky-client/dinky-client-1.15/src/main/java/org/apache/flink/yarn/Utils.java @@ -0,0 +1,658 @@ +/* + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +package org.apache.flink.yarn; + +import static org.apache.flink.util.Preconditions.checkNotNull; +import static org.apache.flink.yarn.YarnConfigKeys.ENV_FLINK_CLASSPATH; +import static org.apache.flink.yarn.YarnConfigKeys.LOCAL_RESOURCE_DESCRIPTOR_SEPARATOR; + +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.flink.configuration.ConfigUtils; +import org.apache.flink.runtime.clusterframework.BootstrapTools; +import org.apache.flink.runtime.clusterframework.ContaineredTaskManagerParameters; +import org.apache.flink.runtime.util.HadoopUtils; +import org.apache.flink.util.FlinkException; +import org.apache.flink.util.StringUtils; +import org.apache.flink.util.function.FunctionWithException; +import org.apache.flink.yarn.configuration.YarnConfigOptions; +import org.apache.flink.yarn.configuration.YarnResourceManagerDriverConfiguration; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.DataOutputBuffer; +import org.apache.hadoop.mapreduce.security.TokenCache; +import org.apache.hadoop.security.Credentials; +import org.apache.hadoop.security.UserGroupInformation; +import org.apache.hadoop.security.token.Token; +import org.apache.hadoop.security.token.TokenIdentifier; +import org.apache.hadoop.util.StringInterner; +import org.apache.hadoop.yarn.api.ApplicationConstants; +import org.apache.hadoop.yarn.api.ApplicationConstants.Environment; +import org.apache.hadoop.yarn.api.records.ContainerLaunchContext; +import org.apache.hadoop.yarn.api.records.LocalResource; +import org.apache.hadoop.yarn.api.records.LocalResourceType; +import org.apache.hadoop.yarn.api.records.LocalResourceVisibility; +import org.apache.hadoop.yarn.api.records.Resource; +import org.apache.hadoop.yarn.conf.YarnConfiguration; +import org.apache.hadoop.yarn.security.AMRMTokenIdentifier; +import org.apache.hadoop.yarn.util.ConverterUtils; +import org.apache.hadoop.yarn.util.Records; + +import java.io.Closeable; +import java.io.File; +import java.io.IOException; +import java.lang.reflect.InvocationTargetException; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.stream.Stream; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import cn.hutool.core.util.StrUtil; + +/** Utility class that provides helper methods to work with Apache Hadoop YARN. */ +public final class Utils { + + private static final Logger LOG = LoggerFactory.getLogger(Utils.class); + + /** KRB5 file name populated in YARN container for secure IT run. */ + public static final String KRB5_FILE_NAME = "krb5.conf"; + + /** Yarn site xml file name populated in YARN container for secure IT run. */ + public static final String YARN_SITE_FILE_NAME = "yarn-site.xml"; + + /** The prefixes that Flink adds to the YARN config. 
*/ + private static final String[] FLINK_CONFIG_PREFIXES = {"flink.yarn."}; + + @VisibleForTesting + static final String YARN_RM_FAIR_SCHEDULER_CLAZZ = + "org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler"; + + @VisibleForTesting + static final String YARN_RM_SLS_FAIR_SCHEDULER_CLAZZ = "org.apache.hadoop.yarn.sls.scheduler.SLSFairScheduler"; + + @VisibleForTesting + static final String YARN_RM_INCREMENT_ALLOCATION_MB_KEY = "yarn.resource-types.memory-mb.increment-allocation"; + + @VisibleForTesting + static final String YARN_RM_INCREMENT_ALLOCATION_MB_LEGACY_KEY = "yarn.scheduler.increment-allocation-mb"; + + private static final int DEFAULT_YARN_RM_INCREMENT_ALLOCATION_MB = 1024; + + @VisibleForTesting + static final String YARN_RM_INCREMENT_ALLOCATION_VCORES_KEY = "yarn.resource-types.vcores.increment-allocation"; + + @VisibleForTesting + static final String YARN_RM_INCREMENT_ALLOCATION_VCORES_LEGACY_KEY = "yarn.scheduler.increment-allocation-vcores"; + + private static final int DEFAULT_YARN_RM_INCREMENT_ALLOCATION_VCORES = 1; + + public static void setupYarnClassPath(Configuration conf, Map appMasterEnv) { + addToEnvironment(appMasterEnv, Environment.CLASSPATH.name(), appMasterEnv.get(ENV_FLINK_CLASSPATH)); + String[] applicationClassPathEntries = conf.getStrings( + YarnConfiguration.YARN_APPLICATION_CLASSPATH, + Stream.of(YarnConfiguration.DEFAULT_YARN_APPLICATION_CLASSPATH) + .map(x -> StrUtil.removeAll(x, "%")) + .map(x -> "$".equals(StrUtil.subPre(x, 1)) ? x : "$" + x) + .toArray(String[]::new)); + for (String c : applicationClassPathEntries) { + addToEnvironment(appMasterEnv, Environment.CLASSPATH.name(), c.trim()); + } + } + + /** + * Deletes the YARN application files, e.g., Flink binaries, libraries, etc., from the remote + * filesystem. + * + * @param applicationFilesDir The application files directory. + */ + public static void deleteApplicationFiles(final String applicationFilesDir) { + if (!StringUtils.isNullOrWhitespaceOnly(applicationFilesDir)) { + final org.apache.flink.core.fs.Path path = new org.apache.flink.core.fs.Path(applicationFilesDir); + try { + final org.apache.flink.core.fs.FileSystem fileSystem = path.getFileSystem(); + if (!fileSystem.delete(path, true)) { + LOG.error("Deleting yarn application files under {} was unsuccessful.", applicationFilesDir); + } + } catch (final IOException e) { + LOG.error("Could not properly delete yarn application files directory {}.", applicationFilesDir, e); + } + } else { + LOG.debug("No yarn application files directory set. Therefore, cannot clean up the data."); + } + } + + /** + * Creates a YARN resource for the remote object at the given location. 
+ * + * @param remoteRsrcPath remote location of the resource + * @param resourceSize size of the resource + * @param resourceModificationTime last modification time of the resource + * @return YARN resource + */ + static LocalResource registerLocalResource( + Path remoteRsrcPath, + long resourceSize, + long resourceModificationTime, + LocalResourceVisibility resourceVisibility, + LocalResourceType resourceType) { + LocalResource localResource = Records.newRecord(LocalResource.class); + localResource.setResource(ConverterUtils.getYarnUrlFromURI(remoteRsrcPath.toUri())); + localResource.setSize(resourceSize); + localResource.setTimestamp(resourceModificationTime); + localResource.setType(resourceType); + localResource.setVisibility(resourceVisibility); + return localResource; + } + + /** + * Creates a YARN resource for the remote object at the given location. + * + * @param fs remote filesystem + * @param remoteRsrcPath resource path to be registered + * @return YARN resource + */ + private static LocalResource registerLocalResource( + FileSystem fs, Path remoteRsrcPath, LocalResourceType resourceType) throws IOException { + FileStatus jarStat = fs.getFileStatus(remoteRsrcPath); + return registerLocalResource( + remoteRsrcPath, + jarStat.getLen(), + jarStat.getModificationTime(), + LocalResourceVisibility.APPLICATION, + resourceType); + } + + public static void setTokensFor( + ContainerLaunchContext amContainer, List paths, Configuration conf, boolean obtainingDelegationTokens) + throws IOException { + Credentials credentials = new Credentials(); + + if (obtainingDelegationTokens) { + LOG.info("Obtaining delegation tokens for HDFS and HBase."); + // for HDFS + TokenCache.obtainTokensForNamenodes(credentials, paths.toArray(new Path[0]), conf); + // for HBase + obtainTokenForHBase(credentials, conf); + } else { + LOG.info("Delegation token retrieval for HDFS and HBase is disabled."); + } + + // for user + UserGroupInformation currUsr = UserGroupInformation.getCurrentUser(); + + Collection> usrTok = currUsr.getTokens(); + for (Token token : usrTok) { + LOG.info("Adding user token " + token.getService() + " with " + token); + credentials.addToken(token.getService(), token); + } + try (DataOutputBuffer dob = new DataOutputBuffer()) { + credentials.writeTokenStorageToStream(dob); + + if (LOG.isDebugEnabled()) { + LOG.debug("Wrote tokens. Credentials buffer length: " + dob.getLength()); + } + + ByteBuffer securityTokens = ByteBuffer.wrap(dob.getData(), 0, dob.getLength()); + amContainer.setTokens(securityTokens); + } + } + + /** Obtain Kerberos security token for HBase. 
*/ + private static void obtainTokenForHBase(Credentials credentials, Configuration conf) throws IOException { + if (UserGroupInformation.isSecurityEnabled()) { + LOG.info("Attempting to obtain Kerberos security token for HBase"); + try { + // ---- + // Intended call: HBaseConfiguration.addHbaseResources(conf); + Class.forName("org.apache.hadoop.hbase.HBaseConfiguration") + .getMethod("addHbaseResources", Configuration.class) + .invoke(null, conf); + // ---- + + LOG.info("HBase security setting: {}", conf.get("hbase.security.authentication")); + + if (!"kerberos".equals(conf.get("hbase.security.authentication"))) { + LOG.info("HBase has not been configured to use Kerberos."); + return; + } + + Token token; + try { + LOG.info("Obtaining Kerberos security token for HBase"); + // ---- + // Intended call: Token token = + // TokenUtil.obtainToken(conf); + token = (Token) Class.forName("org.apache.hadoop.hbase.security.token.TokenUtil") + .getMethod("obtainToken", Configuration.class) + .invoke(null, conf); + // ---- + } catch (NoSuchMethodException e) { + // for HBase 2 + + // ---- + // Intended call: ConnectionFactory connectionFactory = + // ConnectionFactory.createConnection(conf); + Closeable connectionFactory = + (Closeable) Class.forName("org.apache.hadoop.hbase.client.ConnectionFactory") + .getMethod("createConnection", Configuration.class) + .invoke(null, conf); + // ---- + Class connectionClass = Class.forName("org.apache.hadoop.hbase.client.Connection"); + // ---- + // Intended call: Token token = + // TokenUtil.obtainToken(connectionFactory); + token = (Token) Class.forName("org.apache.hadoop.hbase.security.token.TokenUtil") + .getMethod("obtainToken", connectionClass) + .invoke(null, connectionFactory); + // ---- + if (null != connectionFactory) { + connectionFactory.close(); + } + } + + if (token == null) { + LOG.error("No Kerberos security token for HBase available"); + return; + } + + credentials.addToken(token.getService(), token); + LOG.info("Added HBase Kerberos security token to credentials."); + } catch (ClassNotFoundException + | NoSuchMethodException + | IllegalAccessException + | InvocationTargetException e) { + LOG.info( + "HBase is not available (not packaged with this application): {} : \"{}\".", + e.getClass().getSimpleName(), + e.getMessage()); + } + } + } + + /** + * Copied method from org.apache.hadoop.yarn.util.Apps. It was broken by YARN-1824 (2.4.0) and + * fixed for 2.4.1 by https://issues.apache.org/jira/browse/YARN-1931 + */ + public static void addToEnvironment(Map environment, String variable, String value) { + String val = environment.get(variable); + if (val == null) { + val = value; + } else { + val = val + YarnClusterDescriptor.pathSeparator + value; + } + environment.put(StringInterner.weakIntern(variable), StringInterner.weakIntern(val)); + } + + /** + * Resolve keytab path either as absolute path or relative to working directory. + * + * @param workingDir current working directory + * @param keytabPath configured keytab path. + * @return resolved keytab path, or null if not found. 
+ */ + public static String resolveKeytabPath(String workingDir, String keytabPath) { + String keytab = null; + if (keytabPath != null) { + File f; + f = new File(keytabPath); + if (f.exists()) { + keytab = f.getAbsolutePath(); + LOG.info("Resolved keytab path: {}", keytab); + } else { + // try using relative paths, this is the case when the keytab was shipped + // as a local resource + f = new File(workingDir, keytabPath); + if (f.exists()) { + keytab = f.getAbsolutePath(); + LOG.info("Resolved keytab path: {}", keytab); + } else { + LOG.warn("Could not resolve keytab path with: {}", keytabPath); + keytab = null; + } + } + } + return keytab; + } + + /** Private constructor to prevent instantiation. */ + private Utils() { + throw new RuntimeException(); + } + + /** + * Creates the launch context, which describes how to bring up a TaskExecutor / TaskManager + * process in an allocated YARN container. + * + *
<p>
This code is extremely YARN specific and registers all the resources that the TaskExecutor + * needs (such as JAR file, config file, ...) and all environment variables in a YARN container + * launch context. The launch context then ensures that those resources will be copied into the + * containers transient working directory. + * + * @param flinkConfig The Flink configuration object. + * @param yarnConfig The YARN configuration object. + * @param configuration The YarnResourceManagerDriver configurations. + * @param tmParams The TaskExecutor container memory parameters. + * @param taskManagerDynamicProperties The dynamic configurations to be updated for the + * TaskExecutors based on client uploaded Flink config. + * @param workingDirectory The current application master container's working directory. + * @param taskManagerMainClass The class with the main method. + * @param log The logger. + * @return The launch context for the TaskManager processes. + * @throws Exception Thrown if the launch context could not be created, for example if the + * resources could not be copied. + */ + static ContainerLaunchContext createTaskExecutorContext( + org.apache.flink.configuration.Configuration flinkConfig, + YarnConfiguration yarnConfig, + YarnResourceManagerDriverConfiguration configuration, + ContaineredTaskManagerParameters tmParams, + String taskManagerDynamicProperties, + String workingDirectory, + Class taskManagerMainClass, + Logger log) + throws Exception { + + // get and validate all relevant variables + + String remoteFlinkJarPath = checkNotNull( + configuration.getFlinkDistJar(), "Environment variable %s not set", YarnConfigKeys.FLINK_DIST_JAR); + + String shipListString = checkNotNull( + configuration.getClientShipFiles(), + "Environment variable %s not set", + YarnConfigKeys.ENV_CLIENT_SHIP_FILES); + + final String remoteKeytabPath = configuration.getRemoteKeytabPath(); + final String localKeytabPath = configuration.getLocalKeytabPath(); + final String keytabPrincipal = configuration.getKeytabPrinciple(); + final String remoteYarnConfPath = configuration.getYarnSiteXMLPath(); + final String remoteKrb5Path = configuration.getKrb5Path(); + + if (log.isDebugEnabled()) { + log.debug("TM:remote keytab path obtained {}", remoteKeytabPath); + log.debug("TM:local keytab path obtained {}", localKeytabPath); + log.debug("TM:keytab principal obtained {}", keytabPrincipal); + log.debug("TM:remote yarn conf path obtained {}", remoteYarnConfPath); + log.debug("TM:remote krb5 path obtained {}", remoteKrb5Path); + } + + String classPathString = checkNotNull( + configuration.getFlinkClasspath(), + "Environment variable %s not set", + YarnConfigKeys.ENV_FLINK_CLASSPATH); + + // register keytab + LocalResource keytabResource = null; + if (remoteKeytabPath != null) { + log.info("TM:Adding keytab {} to the container local resource bucket", remoteKeytabPath); + Path keytabPath = new Path(remoteKeytabPath); + FileSystem fs = keytabPath.getFileSystem(yarnConfig); + keytabResource = registerLocalResource(fs, keytabPath, LocalResourceType.FILE); + } + + // To support Yarn Secure Integration Test Scenario + LocalResource yarnConfResource = null; + if (remoteYarnConfPath != null) { + log.info("TM:Adding remoteYarnConfPath {} to the container local resource bucket", remoteYarnConfPath); + Path yarnConfPath = new Path(remoteYarnConfPath); + FileSystem fs = yarnConfPath.getFileSystem(yarnConfig); + yarnConfResource = registerLocalResource(fs, yarnConfPath, LocalResourceType.FILE); + } + + // register krb5.conf + 
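The keytab, yarn-site.xml and, just below, the krb5.conf file are all published to containers through the same LocalResource mechanism. Isolated for clarity, the registration step looks roughly like this sketch, which mirrors Utils.registerLocalResource earlier in this patch (Hadoop client APIs assumed on the classpath; the wrapper class is illustrative):

    import org.apache.hadoop.fs.FileStatus;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.yarn.api.records.LocalResource;
    import org.apache.hadoop.yarn.api.records.LocalResourceType;
    import org.apache.hadoop.yarn.api.records.LocalResourceVisibility;
    import org.apache.hadoop.yarn.util.ConverterUtils;
    import org.apache.hadoop.yarn.util.Records;

    import java.io.IOException;

    final class LocalResourceSketch {
        // Stat the remote file, then describe it as an APPLICATION-visible FILE
        // resource so YARN localizes it into the container working directory.
        static LocalResource describe(FileSystem fs, Path remote) throws IOException {
            FileStatus status = fs.getFileStatus(remote);
            LocalResource res = Records.newRecord(LocalResource.class);
            res.setResource(ConverterUtils.getYarnUrlFromURI(remote.toUri()));
            res.setSize(status.getLen());
            res.setTimestamp(status.getModificationTime());
            res.setType(LocalResourceType.FILE);
            res.setVisibility(LocalResourceVisibility.APPLICATION);
            return res;
        }
    }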
LocalResource krb5ConfResource = null; + boolean hasKrb5 = false; + if (remoteKrb5Path != null) { + log.info("Adding remoteKrb5Path {} to the container local resource bucket", remoteKrb5Path); + Path krb5ConfPath = new Path(remoteKrb5Path); + FileSystem fs = krb5ConfPath.getFileSystem(yarnConfig); + krb5ConfResource = registerLocalResource(fs, krb5ConfPath, LocalResourceType.FILE); + hasKrb5 = true; + } + + Map taskManagerLocalResources = new HashMap<>(); + + // register Flink Jar with remote HDFS + final YarnLocalResourceDescriptor flinkDistLocalResourceDesc = + YarnLocalResourceDescriptor.fromString(remoteFlinkJarPath); + taskManagerLocalResources.put( + flinkDistLocalResourceDesc.getResourceKey(), flinkDistLocalResourceDesc.toLocalResource()); + + // To support Yarn Secure Integration Test Scenario + if (yarnConfResource != null) { + taskManagerLocalResources.put(YARN_SITE_FILE_NAME, yarnConfResource); + } + if (krb5ConfResource != null) { + taskManagerLocalResources.put(KRB5_FILE_NAME, krb5ConfResource); + } + if (keytabResource != null) { + taskManagerLocalResources.put(localKeytabPath, keytabResource); + } + + // prepare additional files to be shipped + decodeYarnLocalResourceDescriptorListFromString(shipListString) + .forEach(resourceDesc -> + taskManagerLocalResources.put(resourceDesc.getResourceKey(), resourceDesc.toLocalResource())); + + // now that all resources are prepared, we can create the launch context + + log.info("Creating container launch context for TaskManagers"); + + boolean hasLogback = new File(workingDirectory, "logback.xml").exists(); + boolean hasLog4j = new File(workingDirectory, "log4j.properties").exists(); + + String launchCommand = BootstrapTools.getTaskManagerShellCommand( + flinkConfig, + tmParams, + ".", + ApplicationConstants.LOG_DIR_EXPANSION_VAR, + hasLogback, + hasLog4j, + hasKrb5, + taskManagerMainClass, + taskManagerDynamicProperties); + + if (log.isDebugEnabled()) { + log.debug("Starting TaskManagers with command: " + launchCommand); + } else { + log.info("Starting TaskManagers"); + } + + ContainerLaunchContext ctx = Records.newRecord(ContainerLaunchContext.class); + ctx.setCommands(Collections.singletonList(launchCommand)); + ctx.setLocalResources(taskManagerLocalResources); + + Map containerEnv = new HashMap<>(); + containerEnv.putAll(tmParams.taskManagerEnv()); + + // add YARN classpath, etc to the container environment + containerEnv.put(ENV_FLINK_CLASSPATH, classPathString); + setupYarnClassPath(yarnConfig, containerEnv); + + containerEnv.put( + YarnConfigKeys.ENV_HADOOP_USER_NAME, + UserGroupInformation.getCurrentUser().getUserName()); + + if (remoteKeytabPath != null && localKeytabPath != null && keytabPrincipal != null) { + containerEnv.put(YarnConfigKeys.REMOTE_KEYTAB_PATH, remoteKeytabPath); + containerEnv.put(YarnConfigKeys.LOCAL_KEYTAB_PATH, localKeytabPath); + containerEnv.put(YarnConfigKeys.KEYTAB_PRINCIPAL, keytabPrincipal); + } else if (localKeytabPath != null && keytabPrincipal != null) { + containerEnv.put(YarnConfigKeys.LOCAL_KEYTAB_PATH, localKeytabPath); + containerEnv.put(YarnConfigKeys.KEYTAB_PRINCIPAL, keytabPrincipal); + } + + ctx.setEnvironment(containerEnv); + + // For TaskManager YARN container context, read the tokens from the jobmanager yarn + // container local file. + // NOTE: must read the tokens from the local file, not from the UGI context, because if UGI + // is login + // using Kerberos keytabs, there is no HDFS delegation token in the UGI context. 
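As the note above says, the TaskManager launch context reads the job's delegation tokens from the file named by HADOOP_TOKEN_FILE_LOCATION and must not forward the AM/RM token, since only the ApplicationMaster is allowed to talk to the ResourceManager. The code that follows does this inline; isolated for clarity, the filtering step looks roughly like this sketch against Hadoop's Credentials API (the wrapper class is illustrative):

    import org.apache.hadoop.security.Credentials;
    import org.apache.hadoop.security.token.Token;
    import org.apache.hadoop.security.token.TokenIdentifier;
    import org.apache.hadoop.yarn.security.AMRMTokenIdentifier;

    final class TokenFilterSketch {
        // Copies every token except the AM/RM token into a fresh Credentials
        // object that is safe to ship to TaskManager containers.
        static Credentials withoutAmRmToken(Credentials source) {
            Credentials filtered = new Credentials();
            for (Token<? extends TokenIdentifier> token : source.getAllTokens()) {
                if (!token.getKind().equals(AMRMTokenIdentifier.KIND_NAME)) {
                    filtered.addToken(token.getService(), token);
                }
            }
            return filtered;
        }
    }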
+ final String fileLocation = System.getenv(UserGroupInformation.HADOOP_TOKEN_FILE_LOCATION); + + if (fileLocation != null) { + log.debug("Adding security tokens to TaskExecutor's container launch context."); + + try (DataOutputBuffer dob = new DataOutputBuffer()) { + Credentials cred = Credentials.readTokenStorageFile( + new File(fileLocation), HadoopUtils.getHadoopConfiguration(flinkConfig)); + + // Filter out AMRMToken before setting the tokens to the TaskManager container + // context. + Credentials taskManagerCred = new Credentials(); + Collection> userTokens = cred.getAllTokens(); + for (Token token : userTokens) { + if (!token.getKind().equals(AMRMTokenIdentifier.KIND_NAME)) { + taskManagerCred.addToken(token.getService(), token); + } + } + + taskManagerCred.writeTokenStorageToStream(dob); + ByteBuffer securityTokens = ByteBuffer.wrap(dob.getData(), 0, dob.getLength()); + ctx.setTokens(securityTokens); + } catch (Throwable t) { + log.error("Failed to add Hadoop's security tokens.", t); + } + } else { + log.info("Could not set security tokens because Hadoop's token file location is unknown."); + } + + return ctx; + } + + static boolean isRemotePath(String path) throws IOException { + org.apache.flink.core.fs.Path flinkPath = new org.apache.flink.core.fs.Path(path); + return flinkPath.getFileSystem().isDistributedFS(); + } + + private static List decodeYarnLocalResourceDescriptorListFromString(String resources) + throws Exception { + final List resourceDescriptors = new ArrayList<>(); + for (String shipResourceDescStr : resources.split(LOCAL_RESOURCE_DESCRIPTOR_SEPARATOR)) { + if (!shipResourceDescStr.isEmpty()) { + resourceDescriptors.add(YarnLocalResourceDescriptor.fromString(shipResourceDescStr)); + } + } + return resourceDescriptors; + } + + @VisibleForTesting + static Resource getUnitResource(YarnConfiguration yarnConfig) { + final int unitMemMB, unitVcore; + + final String yarnRmSchedulerClazzName = yarnConfig.get(YarnConfiguration.RM_SCHEDULER); + if (Objects.equals(yarnRmSchedulerClazzName, YARN_RM_FAIR_SCHEDULER_CLAZZ) + || Objects.equals(yarnRmSchedulerClazzName, YARN_RM_SLS_FAIR_SCHEDULER_CLAZZ)) { + String propMem = yarnConfig.get(YARN_RM_INCREMENT_ALLOCATION_MB_KEY); + String propVcore = yarnConfig.get(YARN_RM_INCREMENT_ALLOCATION_VCORES_KEY); + + unitMemMB = propMem != null + ? Integer.parseInt(propMem) + : yarnConfig.getInt( + YARN_RM_INCREMENT_ALLOCATION_MB_LEGACY_KEY, DEFAULT_YARN_RM_INCREMENT_ALLOCATION_MB); + unitVcore = propVcore != null + ? 
Integer.parseInt(propVcore) + : yarnConfig.getInt( + YARN_RM_INCREMENT_ALLOCATION_VCORES_LEGACY_KEY, + DEFAULT_YARN_RM_INCREMENT_ALLOCATION_VCORES); + } else { + unitMemMB = yarnConfig.getInt( + YarnConfiguration.RM_SCHEDULER_MINIMUM_ALLOCATION_MB, + YarnConfiguration.DEFAULT_RM_SCHEDULER_MINIMUM_ALLOCATION_MB); + unitVcore = yarnConfig.getInt( + YarnConfiguration.RM_SCHEDULER_MINIMUM_ALLOCATION_VCORES, + YarnConfiguration.DEFAULT_RM_SCHEDULER_MINIMUM_ALLOCATION_VCORES); + } + + return Resource.newInstance(unitMemMB, unitVcore); + } + + public static List getQualifiedRemoteSharedPaths( + org.apache.flink.configuration.Configuration configuration, YarnConfiguration yarnConfiguration) + throws IOException, FlinkException { + + return getRemoteSharedPaths(configuration, pathStr -> { + final Path path = new Path(pathStr); + return path.getFileSystem(yarnConfiguration).makeQualified(path); + }); + } + + private static List getRemoteSharedPaths( + org.apache.flink.configuration.Configuration configuration, + FunctionWithException strToPathMapper) + throws IOException, FlinkException { + + final List providedLibDirs = + ConfigUtils.decodeListFromConfig(configuration, YarnConfigOptions.PROVIDED_LIB_DIRS, strToPathMapper); + + for (Path path : providedLibDirs) { + if (!Utils.isRemotePath(path.toString())) { + throw new FlinkException("The \"" + + YarnConfigOptions.PROVIDED_LIB_DIRS.key() + + "\" should only contain" + + " dirs accessible from all worker nodes, while the \"" + + path + + "\" is local."); + } + } + return providedLibDirs; + } + + public static YarnConfiguration getYarnAndHadoopConfiguration( + org.apache.flink.configuration.Configuration flinkConfig) { + final YarnConfiguration yarnConfig = getYarnConfiguration(flinkConfig); + yarnConfig.addResource(HadoopUtils.getHadoopConfiguration(flinkConfig)); + + return yarnConfig; + } + + /** + * Add additional config entries from the flink config to the yarn config. + * + * @param flinkConfig The Flink configuration object. + * @return The yarn configuration. + */ + public static YarnConfiguration getYarnConfiguration(org.apache.flink.configuration.Configuration flinkConfig) { + final YarnConfiguration yarnConfig = new YarnConfiguration(); + + for (String key : flinkConfig.keySet()) { + for (String prefix : FLINK_CONFIG_PREFIXES) { + if (key.startsWith(prefix)) { + String newKey = key.substring("flink.".length()); + String value = flinkConfig.getString(key, null); + yarnConfig.set(newKey, value); + LOG.debug("Adding Flink config entry for {} as {}={} to Yarn config", key, newKey, value); + } + } + } + + return yarnConfig; + } +} diff --git a/dinky-client/dinky-client-1.15/src/main/java/org/apache/flink/yarn/YarnClusterDescriptor.java b/dinky-client/dinky-client-1.15/src/main/java/org/apache/flink/yarn/YarnClusterDescriptor.java new file mode 100644 index 0000000000..07d7da98cc --- /dev/null +++ b/dinky-client/dinky-client-1.15/src/main/java/org/apache/flink/yarn/YarnClusterDescriptor.java @@ -0,0 +1,1694 @@ +/* + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package org.apache.flink.yarn; + +import static org.apache.flink.client.deployment.application.ApplicationConfiguration.APPLICATION_MAIN_CLASS; +import static org.apache.flink.configuration.ConfigConstants.DEFAULT_FLINK_USR_LIB_DIR; +import static org.apache.flink.configuration.ConfigConstants.ENV_FLINK_LIB_DIR; +import static org.apache.flink.configuration.ConfigConstants.ENV_FLINK_OPT_DIR; +import static org.apache.flink.runtime.entrypoint.component.FileJobGraphRetriever.JOB_GRAPH_FILE_PATH; +import static org.apache.flink.util.Preconditions.checkArgument; +import static org.apache.flink.util.Preconditions.checkNotNull; +import static org.apache.flink.yarn.YarnConfigKeys.LOCAL_RESOURCE_DESCRIPTOR_SEPARATOR; + +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.flink.api.common.cache.DistributedCache; +import org.apache.flink.api.java.tuple.Tuple2; +import org.apache.flink.client.deployment.ClusterDeploymentException; +import org.apache.flink.client.deployment.ClusterDescriptor; +import org.apache.flink.client.deployment.ClusterRetrieveException; +import org.apache.flink.client.deployment.ClusterSpecification; +import org.apache.flink.client.deployment.application.ApplicationConfiguration; +import org.apache.flink.client.program.ClusterClientProvider; +import org.apache.flink.client.program.PackagedProgramUtils; +import org.apache.flink.client.program.rest.RestClusterClient; +import org.apache.flink.configuration.ConfigConstants; +import org.apache.flink.configuration.ConfigOption; +import org.apache.flink.configuration.ConfigUtils; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.configuration.ConfigurationUtils; +import org.apache.flink.configuration.CoreOptions; +import org.apache.flink.configuration.HighAvailabilityOptions; +import org.apache.flink.configuration.IllegalConfigurationException; +import org.apache.flink.configuration.JobManagerOptions; +import org.apache.flink.configuration.PipelineOptions; +import org.apache.flink.configuration.ResourceManagerOptions; +import org.apache.flink.configuration.RestOptions; +import org.apache.flink.configuration.SecurityOptions; +import org.apache.flink.configuration.TaskManagerOptions; +import org.apache.flink.core.plugin.PluginConfig; +import org.apache.flink.core.plugin.PluginUtils; +import org.apache.flink.runtime.clusterframework.BootstrapTools; +import org.apache.flink.runtime.entrypoint.ClusterEntrypoint; +import org.apache.flink.runtime.entrypoint.ClusterEntrypointUtils; +import org.apache.flink.runtime.jobgraph.JobGraph; +import org.apache.flink.runtime.jobmanager.HighAvailabilityMode; +import org.apache.flink.runtime.jobmanager.JobManagerProcessSpec; +import org.apache.flink.runtime.jobmanager.JobManagerProcessUtils; +import org.apache.flink.runtime.util.HadoopUtils; +import org.apache.flink.util.CollectionUtil; +import org.apache.flink.util.FlinkException; +import org.apache.flink.util.Preconditions; +import org.apache.flink.util.ShutdownHookUtil; +import org.apache.flink.util.StringUtils; +import org.apache.flink.yarn.configuration.YarnConfigOptions; 
+import org.apache.flink.yarn.configuration.YarnConfigOptionsInternal; +import org.apache.flink.yarn.configuration.YarnDeploymentTarget; +import org.apache.flink.yarn.configuration.YarnLogConfigUtil; +import org.apache.flink.yarn.entrypoint.YarnApplicationClusterEntryPoint; +import org.apache.flink.yarn.entrypoint.YarnJobClusterEntrypoint; +import org.apache.flink.yarn.entrypoint.YarnSessionClusterEntrypoint; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hdfs.DFSConfigKeys; +import org.apache.hadoop.security.UserGroupInformation; +import org.apache.hadoop.yarn.api.ApplicationConstants; +import org.apache.hadoop.yarn.api.protocolrecords.GetNewApplicationResponse; +import org.apache.hadoop.yarn.api.records.ApplicationId; +import org.apache.hadoop.yarn.api.records.ApplicationReport; +import org.apache.hadoop.yarn.api.records.ApplicationSubmissionContext; +import org.apache.hadoop.yarn.api.records.ContainerLaunchContext; +import org.apache.hadoop.yarn.api.records.FinalApplicationStatus; +import org.apache.hadoop.yarn.api.records.LocalResourceType; +import org.apache.hadoop.yarn.api.records.NodeReport; +import org.apache.hadoop.yarn.api.records.NodeState; +import org.apache.hadoop.yarn.api.records.Priority; +import org.apache.hadoop.yarn.api.records.QueueInfo; +import org.apache.hadoop.yarn.api.records.Resource; +import org.apache.hadoop.yarn.api.records.YarnApplicationState; +import org.apache.hadoop.yarn.api.records.YarnClusterMetrics; +import org.apache.hadoop.yarn.client.api.YarnClient; +import org.apache.hadoop.yarn.client.api.YarnClientApplication; +import org.apache.hadoop.yarn.conf.YarnConfiguration; +import org.apache.hadoop.yarn.exceptions.YarnException; +import org.apache.hadoop.yarn.util.ConverterUtils; +import org.apache.hadoop.yarn.util.Records; + +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.ObjectOutputStream; +import java.io.PrintStream; +import java.io.UnsupportedEncodingException; +import java.lang.reflect.InvocationTargetException; +import java.lang.reflect.Method; +import java.net.URI; +import java.net.URLDecoder; +import java.nio.charset.Charset; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Set; +import java.util.stream.Collectors; + +import javax.annotation.Nullable; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** The descriptor with deployment information for deploying a Flink cluster on Yarn. */ +public class YarnClusterDescriptor implements ClusterDescriptor { + private static final Logger LOG = LoggerFactory.getLogger(YarnClusterDescriptor.class); + public static final String pathSeparator = ":"; + + private final YarnConfiguration yarnConfiguration; + + private final YarnClient yarnClient; + + private final YarnClusterInformationRetriever yarnClusterInformationRetriever; + + /** True if the descriptor must not shut down the YarnClient. */ + private final boolean sharedYarnClient; + + /** Lazily initialized list of files to ship. 
*/ + private final List shipFiles = new LinkedList<>(); + + private final List shipArchives = new LinkedList<>(); + + private final String yarnQueue; + + private Path flinkJarPath; + + private final Configuration flinkConfiguration; + + private final String customName; + + private final String nodeLabel; + + private final String applicationType; + + private YarnConfigOptions.UserJarInclusion userJarInclusion; + + public YarnClusterDescriptor( + Configuration flinkConfiguration, + YarnConfiguration yarnConfiguration, + YarnClient yarnClient, + YarnClusterInformationRetriever yarnClusterInformationRetriever, + boolean sharedYarnClient) { + + this.yarnConfiguration = Preconditions.checkNotNull(yarnConfiguration); + this.yarnClient = Preconditions.checkNotNull(yarnClient); + this.yarnClusterInformationRetriever = Preconditions.checkNotNull(yarnClusterInformationRetriever); + this.sharedYarnClient = sharedYarnClient; + + this.flinkConfiguration = Preconditions.checkNotNull(flinkConfiguration); + this.userJarInclusion = getUserJarInclusionMode(flinkConfiguration); + + getLocalFlinkDistPath(flinkConfiguration).ifPresent(this::setLocalJarPath); + decodeFilesToShipToCluster(flinkConfiguration, YarnConfigOptions.SHIP_FILES) + .ifPresent(this::addShipFiles); + decodeFilesToShipToCluster(flinkConfiguration, YarnConfigOptions.SHIP_ARCHIVES) + .ifPresent(this::addShipArchives); + + this.yarnQueue = flinkConfiguration.getString(YarnConfigOptions.APPLICATION_QUEUE); + this.customName = flinkConfiguration.getString(YarnConfigOptions.APPLICATION_NAME); + this.applicationType = flinkConfiguration.getString(YarnConfigOptions.APPLICATION_TYPE); + this.nodeLabel = flinkConfiguration.getString(YarnConfigOptions.NODE_LABEL); + } + + private Optional> decodeFilesToShipToCluster( + final Configuration configuration, final ConfigOption> configOption) { + checkNotNull(configuration); + checkNotNull(configOption); + + final List files = ConfigUtils.decodeListFromConfig(configuration, configOption, File::new); + return files.isEmpty() ? Optional.empty() : Optional.of(files); + } + + private Optional getLocalFlinkDistPath(final Configuration configuration) { + final String localJarPath = configuration.getString(YarnConfigOptions.FLINK_DIST_JAR); + if (localJarPath != null) { + return Optional.of(new Path(localJarPath)); + } + + LOG.info("No path for the flink jar passed. Using the location of " + getClass() + " to locate the jar"); + + // check whether it's actually a jar file --> when testing we execute this class without a + // flink-dist jar + final String decodedPath = getDecodedJarPath(); + return decodedPath.endsWith(".jar") ? Optional.of(new Path(new File(decodedPath).toURI())) : Optional.empty(); + } + + private String getDecodedJarPath() { + final String encodedJarPath = YarnClusterClientFactory.class + .getProtectionDomain() + .getCodeSource() + .getLocation() + .getPath(); + try { + return URLDecoder.decode(encodedJarPath, Charset.defaultCharset().name()); + } catch (UnsupportedEncodingException e) { + throw new RuntimeException("Couldn't decode the encoded Flink dist jar path: " + + encodedJarPath + + " You can supply a path manually via the command line."); + } + } + + @VisibleForTesting + List getShipFiles() { + return shipFiles; + } + + public YarnClient getYarnClient() { + return yarnClient; + } + + /** + * The class to start the application master with. This class runs the main method in case of + * session cluster. 
+ */ + protected String getYarnSessionClusterEntrypoint() { + return YarnSessionClusterEntrypoint.class.getName(); + } + + /** + * The class to start the application master with. This class runs the main method in case of + * the job cluster. + */ + protected String getYarnJobClusterEntrypoint() { + return YarnJobClusterEntrypoint.class.getName(); + } + + public Configuration getFlinkConfiguration() { + return flinkConfiguration; + } + + public void setLocalJarPath(Path localJarPath) { + if (!localJarPath.toString().endsWith("jar")) { + throw new IllegalArgumentException( + "The passed jar path ('" + localJarPath + "') does not end with the 'jar' extension"); + } + this.flinkJarPath = localJarPath; + } + + /** + * Adds the given files to the list of files to ship. + * + *
Note that any file matching "flink-dist*.jar" will be excluded from the upload by + * {@link YarnApplicationFileUploader#registerMultipleLocalResources(Collection, String, + * LocalResourceType)} since we upload the Flink uber jar ourselves and do not need to deploy it + * multiple times. + * + * @param shipFiles files to ship + */ + public void addShipFiles(List shipFiles) { + checkArgument( + !isUsrLibDirIncludedInShipFiles(shipFiles), + "User-shipped directories configured via : %s should not include %s.", + YarnConfigOptions.SHIP_FILES.key(), + ConfigConstants.DEFAULT_FLINK_USR_LIB_DIR); + this.shipFiles.addAll(shipFiles); + } + + private void addShipArchives(List shipArchives) { + checkArgument(isArchiveOnlyIncludedInShipArchiveFiles(shipArchives), "Non-archive files are included."); + this.shipArchives.addAll(shipArchives); + } + + private static boolean isArchiveOnlyIncludedInShipArchiveFiles(List shipFiles) { + return shipFiles.stream() + .filter(File::isFile) + .map(File::getName) + .map(String::toLowerCase) + .allMatch(name -> name.endsWith(".tar.gz") + || name.endsWith(".tar") + || name.endsWith(".tgz") + || name.endsWith(".dst") + || name.endsWith(".jar") + || name.endsWith(".zip")); + } + + private void isReadyForDeployment(ClusterSpecification clusterSpecification) throws Exception { + + if (this.flinkJarPath == null) { + throw new YarnDeploymentException("The Flink jar path is null"); + } + if (this.flinkConfiguration == null) { + throw new YarnDeploymentException("Flink configuration object has not been set"); + } + + // Check if we don't exceed YARN's maximum virtual cores. + final int numYarnMaxVcores = yarnClusterInformationRetriever.getMaxVcores(); + + int configuredAmVcores = flinkConfiguration.getInteger(YarnConfigOptions.APP_MASTER_VCORES); + if (configuredAmVcores > numYarnMaxVcores) { + throw new IllegalConfigurationException(String.format( + "The number of requested virtual cores for application master %d" + + " exceeds the maximum number of virtual cores %d available in the Yarn Cluster.", + configuredAmVcores, numYarnMaxVcores)); + } + + int configuredVcores = + flinkConfiguration.getInteger(YarnConfigOptions.VCORES, clusterSpecification.getSlotsPerTaskManager()); + // don't configure more than the maximum configured number of vcores + if (configuredVcores > numYarnMaxVcores) { + throw new IllegalConfigurationException(String.format( + "The number of requested virtual cores per node %d" + + " exceeds the maximum number of virtual cores %d available in the Yarn Cluster." + + " Please note that the number of virtual cores is set to the number of task slots by default" + + " unless configured in the Flink config with '%s.'", + configuredVcores, numYarnMaxVcores, YarnConfigOptions.VCORES.key())); + } + + // check if required Hadoop environment variables are set. If not, warn user + if (System.getenv("HADOOP_CONF_DIR") == null && System.getenv("YARN_CONF_DIR") == null) { + LOG.warn("Neither the HADOOP_CONF_DIR nor the YARN_CONF_DIR environment variable is set. 
" + + "The Flink YARN Client needs one of these to be set to properly load the Hadoop " + + "configuration for accessing YARN."); + } + } + + public String getNodeLabel() { + return nodeLabel; + } + + // ------------------------------------------------------------- + // Lifecycle management + // ------------------------------------------------------------- + + @Override + public void close() { + if (!sharedYarnClient) { + yarnClient.stop(); + } + } + + // ------------------------------------------------------------- + // ClusterClient overrides + // ------------------------------------------------------------- + + @Override + public ClusterClientProvider retrieve(ApplicationId applicationId) throws ClusterRetrieveException { + + try { + // check if required Hadoop environment variables are set. If not, warn user + if (System.getenv("HADOOP_CONF_DIR") == null && System.getenv("YARN_CONF_DIR") == null) { + LOG.warn("Neither the HADOOP_CONF_DIR nor the YARN_CONF_DIR environment variable is set." + + "The Flink YARN Client needs one of these to be set to properly load the Hadoop " + + "configuration for accessing YARN."); + } + + final ApplicationReport report = yarnClient.getApplicationReport(applicationId); + + if (report.getFinalApplicationStatus() != FinalApplicationStatus.UNDEFINED) { + // Flink cluster is not running anymore + LOG.error( + "The application {} doesn't run anymore. It has previously completed with final status: {}", + applicationId, + report.getFinalApplicationStatus()); + throw new RuntimeException("The Yarn application " + applicationId + " doesn't run anymore."); + } + + setClusterEntrypointInfoToConfig(report); + + return () -> { + try { + return new RestClusterClient<>(flinkConfiguration, report.getApplicationId()); + } catch (Exception e) { + throw new RuntimeException("Couldn't retrieve Yarn cluster", e); + } + }; + } catch (Exception e) { + throw new ClusterRetrieveException("Couldn't retrieve Yarn cluster", e); + } + } + + @Override + public ClusterClientProvider deploySessionCluster(ClusterSpecification clusterSpecification) + throws ClusterDeploymentException { + try { + return deployInternal( + clusterSpecification, "Flink session cluster", getYarnSessionClusterEntrypoint(), null, false); + } catch (Exception e) { + throw new ClusterDeploymentException("Couldn't deploy Yarn session cluster", e); + } + } + + @Override + public ClusterClientProvider deployApplicationCluster( + final ClusterSpecification clusterSpecification, final ApplicationConfiguration applicationConfiguration) + throws ClusterDeploymentException { + checkNotNull(clusterSpecification); + checkNotNull(applicationConfiguration); + + final YarnDeploymentTarget deploymentTarget = YarnDeploymentTarget.fromConfig(flinkConfiguration); + if (YarnDeploymentTarget.APPLICATION != deploymentTarget) { + throw new ClusterDeploymentException("Couldn't deploy Yarn Application Cluster." + + " Expected deployment.target=" + + YarnDeploymentTarget.APPLICATION.getName() + + " but actual one was \"" + + deploymentTarget.getName() + + "\""); + } + + applicationConfiguration.applyToConfiguration(flinkConfiguration); + + // No need to do pipelineJars validation if it is a PyFlink job. 
+ if (!(PackagedProgramUtils.isPython(applicationConfiguration.getApplicationClassName()) + || PackagedProgramUtils.isPython(applicationConfiguration.getProgramArguments()))) { + final List pipelineJars = + flinkConfiguration.getOptional(PipelineOptions.JARS).orElse(Collections.emptyList()); + Preconditions.checkArgument(pipelineJars.size() == 1, "Should only have one jar"); + } + + try { + return deployInternal( + clusterSpecification, + "Flink Application Cluster", + YarnApplicationClusterEntryPoint.class.getName(), + null, + false); + } catch (Exception e) { + throw new ClusterDeploymentException("Couldn't deploy Yarn Application Cluster", e); + } + } + + @Override + public ClusterClientProvider deployJobCluster( + ClusterSpecification clusterSpecification, JobGraph jobGraph, boolean detached) + throws ClusterDeploymentException { + + LOG.warn( + "Job Clusters are deprecated since Flink 1.15. Please use an Application Cluster/Application Mode instead."); + try { + return deployInternal( + clusterSpecification, "Flink per-job cluster", getYarnJobClusterEntrypoint(), jobGraph, detached); + } catch (Exception e) { + throw new ClusterDeploymentException("Could not deploy Yarn job cluster.", e); + } + } + + @Override + public void killCluster(ApplicationId applicationId) throws FlinkException { + try { + yarnClient.killApplication(applicationId); + + try (final FileSystem fs = FileSystem.get(yarnConfiguration)) { + final Path applicationDir = + YarnApplicationFileUploader.getApplicationDirPath(getStagingDir(fs), applicationId); + + Utils.deleteApplicationFiles(applicationDir.toUri().toString()); + } + + } catch (YarnException | IOException e) { + throw new FlinkException("Could not kill the Yarn Flink cluster with id " + applicationId + '.', e); + } + } + + /** + * This method will block until the ApplicationMaster/JobManager have been deployed on YARN. + * + * @param clusterSpecification Initial cluster specification for the Flink cluster to be + * deployed + * @param applicationName name of the Yarn application to start + * @param yarnClusterEntrypoint Class name of the Yarn cluster entry point. 
+ * @param jobGraph A job graph which is deployed with the Flink cluster, {@code null} if none + * @param detached True if the cluster should be started in detached mode + */ + private ClusterClientProvider deployInternal( + ClusterSpecification clusterSpecification, + String applicationName, + String yarnClusterEntrypoint, + @Nullable JobGraph jobGraph, + boolean detached) + throws Exception { + + final UserGroupInformation currentUser = UserGroupInformation.getCurrentUser(); + if (HadoopUtils.isKerberosSecurityEnabled(currentUser)) { + boolean useTicketCache = flinkConfiguration.getBoolean(SecurityOptions.KERBEROS_LOGIN_USETICKETCACHE); + + if (!HadoopUtils.areKerberosCredentialsValid(currentUser, useTicketCache)) { + throw new RuntimeException("Hadoop security with Kerberos is enabled but the login user " + + "does not have Kerberos credentials or delegation tokens!"); + } + + final boolean fetchToken = flinkConfiguration.getBoolean(SecurityOptions.KERBEROS_FETCH_DELEGATION_TOKEN); + final boolean yarnAccessFSEnabled = + !CollectionUtil.isNullOrEmpty(flinkConfiguration.get(YarnConfigOptions.YARN_ACCESS)); + if (!fetchToken && yarnAccessFSEnabled) { + throw new IllegalConfigurationException(String.format( + "When %s is disabled, %s must be disabled as well.", + SecurityOptions.KERBEROS_FETCH_DELEGATION_TOKEN.key(), YarnConfigOptions.YARN_ACCESS.key())); + } + } + + isReadyForDeployment(clusterSpecification); + + // ------------------ Check if the specified queue exists -------------------- + + checkYarnQueues(yarnClient); + + // ------------------ Check if the YARN ClusterClient has the requested resources + // -------------- + + // Create application via yarnClient + final YarnClientApplication yarnApplication = yarnClient.createApplication(); + final GetNewApplicationResponse appResponse = yarnApplication.getNewApplicationResponse(); + + Resource maxRes = appResponse.getMaximumResourceCapability(); + + final ClusterResourceDescription freeClusterMem; + try { + freeClusterMem = getCurrentFreeClusterResources(yarnClient); + } catch (YarnException | IOException e) { + failSessionDuringDeployment(yarnClient, yarnApplication); + throw new YarnDeploymentException("Could not retrieve information about free cluster resources.", e); + } + + final int yarnMinAllocationMB = yarnConfiguration.getInt( + YarnConfiguration.RM_SCHEDULER_MINIMUM_ALLOCATION_MB, + YarnConfiguration.DEFAULT_RM_SCHEDULER_MINIMUM_ALLOCATION_MB); + if (yarnMinAllocationMB <= 0) { + throw new YarnDeploymentException("The minimum allocation memory " + + "(" + + yarnMinAllocationMB + + " MB) configured via '" + + YarnConfiguration.RM_SCHEDULER_MINIMUM_ALLOCATION_MB + + "' should be greater than 0."); + } + + final ClusterSpecification validClusterSpecification; + try { + validClusterSpecification = + validateClusterResources(clusterSpecification, yarnMinAllocationMB, maxRes, freeClusterMem); + } catch (YarnDeploymentException yde) { + failSessionDuringDeployment(yarnClient, yarnApplication); + throw yde; + } + + LOG.info("Cluster specification: {}", validClusterSpecification); + + final ClusterEntrypoint.ExecutionMode executionMode = + detached ? 
ClusterEntrypoint.ExecutionMode.DETACHED : ClusterEntrypoint.ExecutionMode.NORMAL; + + flinkConfiguration.setString(ClusterEntrypoint.INTERNAL_CLUSTER_EXECUTION_MODE, executionMode.toString()); + + ApplicationReport report = startAppMaster( + flinkConfiguration, + applicationName, + yarnClusterEntrypoint, + jobGraph, + yarnClient, + yarnApplication, + validClusterSpecification); + + // print the application id for user to cancel themselves. + if (detached) { + final ApplicationId yarnApplicationId = report.getApplicationId(); + logDetachedClusterInformation(yarnApplicationId, LOG); + } + + setClusterEntrypointInfoToConfig(report); + + return () -> { + try { + return new RestClusterClient<>(flinkConfiguration, report.getApplicationId()); + } catch (Exception e) { + throw new RuntimeException("Error while creating RestClusterClient.", e); + } + }; + } + + private ClusterSpecification validateClusterResources( + ClusterSpecification clusterSpecification, + int yarnMinAllocationMB, + Resource maximumResourceCapability, + ClusterResourceDescription freeClusterResources) + throws YarnDeploymentException { + + int jobManagerMemoryMb = clusterSpecification.getMasterMemoryMB(); + final int taskManagerMemoryMb = clusterSpecification.getTaskManagerMemoryMB(); + + logIfComponentMemNotIntegerMultipleOfYarnMinAllocation("JobManager", jobManagerMemoryMb, yarnMinAllocationMB); + logIfComponentMemNotIntegerMultipleOfYarnMinAllocation("TaskManager", taskManagerMemoryMb, yarnMinAllocationMB); + + // set the memory to minAllocationMB to do the next checks correctly + if (jobManagerMemoryMb < yarnMinAllocationMB) { + jobManagerMemoryMb = yarnMinAllocationMB; + } + + final String note = + "Please check the 'yarn.scheduler.maximum-allocation-mb' and the 'yarn.nodemanager.resource.memory-mb' configuration values\n"; + if (jobManagerMemoryMb > maximumResourceCapability.getMemory()) { + throw new YarnDeploymentException( + "The cluster does not have the requested resources for the JobManager available!\n" + + "Maximum Memory: " + + maximumResourceCapability.getMemory() + + "MB Requested: " + + jobManagerMemoryMb + + "MB. " + + note); + } + + if (taskManagerMemoryMb > maximumResourceCapability.getMemory()) { + throw new YarnDeploymentException( + "The cluster does not have the requested resources for the TaskManagers available!\n" + + "Maximum Memory: " + + maximumResourceCapability.getMemory() + + " Requested: " + + taskManagerMemoryMb + + "MB. " + + note); + } + + final String noteRsc = + "\nThe Flink YARN client will try to allocate the YARN session, but maybe not all TaskManagers are " + + "connecting from the beginning because the resources are currently not available in the cluster. 
" + + "The allocation might take more time than usual because the Flink YARN client needs to wait until " + + "the resources become available."; + + if (taskManagerMemoryMb > freeClusterResources.containerLimit) { + LOG.warn("The requested amount of memory for the TaskManagers (" + + taskManagerMemoryMb + + "MB) is more than " + + "the largest possible YARN container: " + + freeClusterResources.containerLimit + + noteRsc); + } + if (jobManagerMemoryMb > freeClusterResources.containerLimit) { + LOG.warn("The requested amount of memory for the JobManager (" + + jobManagerMemoryMb + + "MB) is more than " + + "the largest possible YARN container: " + + freeClusterResources.containerLimit + + noteRsc); + } + + return new ClusterSpecification.ClusterSpecificationBuilder() + .setMasterMemoryMB(jobManagerMemoryMb) + .setTaskManagerMemoryMB(taskManagerMemoryMb) + .setSlotsPerTaskManager(clusterSpecification.getSlotsPerTaskManager()) + .createClusterSpecification(); + } + + private void logIfComponentMemNotIntegerMultipleOfYarnMinAllocation( + String componentName, int componentMemoryMB, int yarnMinAllocationMB) { + int normalizedMemMB = + (componentMemoryMB + (yarnMinAllocationMB - 1)) / yarnMinAllocationMB * yarnMinAllocationMB; + if (normalizedMemMB <= 0) { + normalizedMemMB = yarnMinAllocationMB; + } + if (componentMemoryMB != normalizedMemMB) { + LOG.info( + "The configured {} memory is {} MB. YARN will allocate {} MB to make up an integer multiple of its " + + "minimum allocation memory ({} MB, configured via 'yarn.scheduler.minimum-allocation-mb'). The extra {} MB " + + "may not be used by Flink.", + componentName, + componentMemoryMB, + normalizedMemMB, + yarnMinAllocationMB, + normalizedMemMB - componentMemoryMB); + } + } + + private void checkYarnQueues(YarnClient yarnClient) { + try { + List queues = yarnClient.getAllQueues(); + if (queues.size() > 0 + && this.yarnQueue != null) { // check only if there are queues configured in yarn and for + // this session. + boolean queueFound = false; + for (QueueInfo queue : queues) { + if (queue.getQueueName().equals(this.yarnQueue) + || queue.getQueueName().equals("root." + this.yarnQueue)) { + queueFound = true; + break; + } + } + if (!queueFound) { + String queueNames = StringUtils.toQuotedListString(queues.toArray()); + LOG.warn("The specified queue '" + + this.yarnQueue + + "' does not exist. " + + "Available queues: " + + queueNames); + } + } else { + LOG.debug("The YARN cluster does not have any queues configured"); + } + } catch (Throwable e) { + LOG.warn("Error while getting queue information from YARN: " + e.getMessage()); + if (LOG.isDebugEnabled()) { + LOG.debug("Error details", e); + } + } + } + + private ApplicationReport startAppMaster( + Configuration configuration, + String applicationName, + String yarnClusterEntrypoint, + JobGraph jobGraph, + YarnClient yarnClient, + YarnClientApplication yarnApplication, + ClusterSpecification clusterSpecification) + throws Exception { + + // ------------------ Initialize the file systems ------------------------- + + org.apache.flink.core.fs.FileSystem.initialize( + configuration, PluginUtils.createPluginManagerFromRootFolder(configuration)); + + final FileSystem fs = FileSystem.get(yarnConfiguration); + + // hard coded check for the GoogleHDFS client because its not overriding the getScheme() + // method. + if (!fs.getClass().getSimpleName().equals("GoogleHadoopFileSystem") + && fs.getScheme().startsWith("file")) { + LOG.warn("The file system scheme is '" + + fs.getScheme() + + "'. 
This indicates that the " + + "specified Hadoop configuration path is wrong and the system is using the default Hadoop configuration values." + + "The Flink YARN client needs to store its files in a distributed file system"); + } + + ApplicationSubmissionContext appContext = yarnApplication.getApplicationSubmissionContext(); + + final List providedLibDirs = Utils.getQualifiedRemoteSharedPaths(configuration, yarnConfiguration); + + Path stagingDirPath = getStagingDir(fs); + FileSystem stagingDirFs = stagingDirPath.getFileSystem(yarnConfiguration); + final YarnApplicationFileUploader fileUploader = YarnApplicationFileUploader.from( + stagingDirFs, stagingDirPath, providedLibDirs, appContext.getApplicationId(), getFileReplication()); + + // The files need to be shipped and added to classpath. + Set systemShipFiles = new HashSet<>(shipFiles.size()); + for (File file : shipFiles) { + systemShipFiles.add(file.getAbsoluteFile()); + } + + final String logConfigFilePath = configuration.getString(YarnConfigOptionsInternal.APPLICATION_LOG_CONFIG_FILE); + if (logConfigFilePath != null) { + systemShipFiles.add(new File(logConfigFilePath)); + } + + // Set-up ApplicationSubmissionContext for the application + + final ApplicationId appId = appContext.getApplicationId(); + + // ------------------ Add Zookeeper namespace to local flinkConfiguraton ------ + setHAClusterIdIfNotSet(configuration, appId); + + if (HighAvailabilityMode.isHighAvailabilityModeActivated(configuration)) { + // activate re-execution of failed applications + appContext.setMaxAppAttempts(configuration.getInteger( + YarnConfigOptions.APPLICATION_ATTEMPTS.key(), YarnConfiguration.DEFAULT_RM_AM_MAX_ATTEMPTS)); + + activateHighAvailabilitySupport(appContext); + } else { + // set number of application retries to 1 in the default case + appContext.setMaxAppAttempts(configuration.getInteger(YarnConfigOptions.APPLICATION_ATTEMPTS.key(), 1)); + } + + final Set userJarFiles = new HashSet<>(); + if (jobGraph != null) { + userJarFiles.addAll(jobGraph.getUserJars().stream() + .map(f -> f.toUri()) + .map(Path::new) + .collect(Collectors.toSet())); + } + + final List jarUrls = ConfigUtils.decodeListFromConfig(configuration, PipelineOptions.JARS, URI::create); + if (jarUrls != null && YarnApplicationClusterEntryPoint.class.getName().equals(yarnClusterEntrypoint)) { + userJarFiles.addAll(jarUrls.stream().map(Path::new).collect(Collectors.toSet())); + } + + // only for per job mode + if (jobGraph != null) { + for (Map.Entry entry : + jobGraph.getUserArtifacts().entrySet()) { + // only upload local files + if (!Utils.isRemotePath(entry.getValue().filePath)) { + Path localPath = new Path(entry.getValue().filePath); + Tuple2 remoteFileInfo = fileUploader.uploadLocalFileToRemote(localPath, entry.getKey()); + jobGraph.setUserArtifactRemotePath(entry.getKey(), remoteFileInfo.f0.toString()); + } + } + + jobGraph.writeUserArtifactEntriesToConfiguration(); + } + + if (providedLibDirs == null || providedLibDirs.isEmpty()) { + addLibFoldersToShipFiles(systemShipFiles); + } + + // Register all files in provided lib dirs as local resources with public visibility + // and upload the remaining dependencies as local resources with APPLICATION visibility. 
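+ // Directories listed under YarnConfigOptions.PROVIDED_LIB_DIRS are expected to already live on the
+ // cluster file system, so they are only registered (with PUBLIC visibility) rather than uploaded on
+ // every submission. A minimal sketch, assuming a hypothetical HDFS layout:
+ //
+ //   Configuration conf = new Configuration();
+ //   conf.set(YarnConfigOptions.PROVIDED_LIB_DIRS,
+ //           Collections.singletonList("hdfs:///flink/dist/lib")); // hypothetical remote dir
+ //
+ // When the list is empty, the local lib/ and plugins/ folders are shipped and uploaded instead
+ // (see addLibFoldersToShipFiles and addPluginsFoldersToShipFiles below).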
+ final List systemClassPaths = fileUploader.registerProvidedLocalResources(); + final List uploadedDependencies = fileUploader.registerMultipleLocalResources( + systemShipFiles.stream().map(e -> new Path(e.toURI())).collect(Collectors.toSet()), + Path.CUR_DIR, + LocalResourceType.FILE); + systemClassPaths.addAll(uploadedDependencies); + + // upload and register ship-only files + // Plugin files only need to be shipped and should not be added to classpath. + if (providedLibDirs == null || providedLibDirs.isEmpty()) { + Set shipOnlyFiles = new HashSet<>(); + addPluginsFoldersToShipFiles(shipOnlyFiles); + fileUploader.registerMultipleLocalResources( + shipOnlyFiles.stream().map(e -> new Path(e.toURI())).collect(Collectors.toSet()), + Path.CUR_DIR, + LocalResourceType.FILE); + } + + if (!shipArchives.isEmpty()) { + fileUploader.registerMultipleLocalResources( + shipArchives.stream().map(e -> new Path(e.toURI())).collect(Collectors.toSet()), + Path.CUR_DIR, + LocalResourceType.ARCHIVE); + } + + // only for application mode + // Python jar file only needs to be shipped and should not be added to classpath. + if (YarnApplicationClusterEntryPoint.class.getName().equals(yarnClusterEntrypoint) + && PackagedProgramUtils.isPython(configuration.get(APPLICATION_MAIN_CLASS))) { + fileUploader.registerMultipleLocalResources( + Collections.singletonList( + new Path(PackagedProgramUtils.getPythonJar().toURI())), + ConfigConstants.DEFAULT_FLINK_OPT_DIR, + LocalResourceType.FILE); + } + + // Upload and register user jars + final List userClassPaths = fileUploader.registerMultipleLocalResources( + userJarFiles, + userJarInclusion == YarnConfigOptions.UserJarInclusion.DISABLED + ? ConfigConstants.DEFAULT_FLINK_USR_LIB_DIR + : Path.CUR_DIR, + LocalResourceType.FILE); + + // usrlib will be automatically shipped if it exists. 
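+ // Where the user jars registered above end up on the classpath is governed by the
+ // CLASSPATH_INCLUDE_USER_JAR option (see getUserJarInclusionMode at the bottom of this class).
+ // A rough sketch of the resulting ordering, assuming a single hypothetical user jar "udf.jar":
+ //
+ //   FIRST    -> udf.jar:[sorted system entries]
+ //   LAST     -> [sorted system entries]:udf.jar
+ //   ORDER    -> udf.jar is sorted together with the system entries
+ //   DISABLED -> udf.jar stays under usrlib/ and is loaded by the user classloader only
+ //
+ // The separator is the pathSeparator constant defined at the top of this class.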
+ if (ClusterEntrypointUtils.tryFindUserLibDirectory().isPresent()) { + final Set usrLibShipFiles = new HashSet<>(); + addUsrLibFolderToShipFiles(usrLibShipFiles); + final List usrLibClassPaths = fileUploader.registerMultipleLocalResources( + usrLibShipFiles.stream().map(e -> new Path(e.toURI())).collect(Collectors.toSet()), + Path.CUR_DIR, + LocalResourceType.FILE); + userClassPaths.addAll(usrLibClassPaths); + } + + if (userJarInclusion == YarnConfigOptions.UserJarInclusion.ORDER) { + systemClassPaths.addAll(userClassPaths); + } + + // normalize classpath by sorting + Collections.sort(systemClassPaths); + Collections.sort(userClassPaths); + + // classpath assembler + StringBuilder classPathBuilder = new StringBuilder(); + if (userJarInclusion == YarnConfigOptions.UserJarInclusion.FIRST) { + for (String userClassPath : userClassPaths) { + classPathBuilder.append(userClassPath).append(pathSeparator); + } + } + for (String classPath : systemClassPaths) { + classPathBuilder.append(classPath).append(pathSeparator); + } + + // Setup jar for ApplicationMaster + final YarnLocalResourceDescriptor localResourceDescFlinkJar = fileUploader.uploadFlinkDist(flinkJarPath); + classPathBuilder.append(localResourceDescFlinkJar.getResourceKey()).append(pathSeparator); + + // write job graph to tmp file and add it to local resource + // TODO: server use user main method to generate job graph + if (jobGraph != null) { + File tmpJobGraphFile = null; + try { + tmpJobGraphFile = File.createTempFile(appId.toString(), null); + try (FileOutputStream output = new FileOutputStream(tmpJobGraphFile); + ObjectOutputStream obOutput = new ObjectOutputStream(output)) { + obOutput.writeObject(jobGraph); + } + + final String jobGraphFilename = "job.graph"; + configuration.setString(JOB_GRAPH_FILE_PATH, jobGraphFilename); + + fileUploader.registerSingleLocalResource( + jobGraphFilename, new Path(tmpJobGraphFile.toURI()), "", LocalResourceType.FILE, true, false); + classPathBuilder.append(jobGraphFilename).append(pathSeparator); + } catch (Exception e) { + LOG.warn("Add job graph to local resource fail."); + throw e; + } finally { + if (tmpJobGraphFile != null && !tmpJobGraphFile.delete()) { + LOG.warn("Fail to delete temporary file {}.", tmpJobGraphFile.toPath()); + } + } + } + + // Upload the flink configuration + // write out configuration file + File tmpConfigurationFile = null; + try { + tmpConfigurationFile = File.createTempFile(appId + "-flink-conf.yaml", null); + + // remove localhost bind hosts as they render production clusters unusable + removeLocalhostBindHostSetting(configuration, JobManagerOptions.BIND_HOST); + removeLocalhostBindHostSetting(configuration, TaskManagerOptions.BIND_HOST); + // this setting is unconditionally overridden anyway, so we remove it for clarity + configuration.removeConfig(TaskManagerOptions.HOST); + + BootstrapTools.writeConfiguration(configuration, tmpConfigurationFile); + + String flinkConfigKey = "flink-conf.yaml"; + fileUploader.registerSingleLocalResource( + flinkConfigKey, + new Path(tmpConfigurationFile.getAbsolutePath()), + "", + LocalResourceType.FILE, + true, + true); + classPathBuilder.append("flink-conf.yaml").append(pathSeparator); + } finally { + if (tmpConfigurationFile != null && !tmpConfigurationFile.delete()) { + LOG.warn("Fail to delete temporary file {}.", tmpConfigurationFile.toPath()); + } + } + + if (userJarInclusion == YarnConfigOptions.UserJarInclusion.LAST) { + for (String userClassPath : userClassPaths) { + 
classPathBuilder.append(userClassPath).append(pathSeparator); + } + } + + // To support Yarn Secure Integration Test Scenario + // In Integration test setup, the Yarn containers created by YarnMiniCluster does not have + // the Yarn site XML + // and KRB5 configuration files. We are adding these files as container local resources for + // the container + // applications (JM/TMs) to have proper secure cluster setup + Path remoteYarnSiteXmlPath = null; + if (System.getenv("IN_TESTS") != null) { + File f = new File(System.getenv("YARN_CONF_DIR"), Utils.YARN_SITE_FILE_NAME); + LOG.info("Adding Yarn configuration {} to the AM container local resource bucket", f.getAbsolutePath()); + Path yarnSitePath = new Path(f.getAbsolutePath()); + remoteYarnSiteXmlPath = fileUploader + .registerSingleLocalResource( + Utils.YARN_SITE_FILE_NAME, yarnSitePath, "", LocalResourceType.FILE, false, false) + .getPath(); + if (System.getProperty("java.security.krb5.conf") != null) { + configuration.set(SecurityOptions.KERBEROS_KRB5_PATH, System.getProperty("java.security.krb5.conf")); + } + } + + Path remoteKrb5Path = null; + boolean hasKrb5 = false; + String krb5Config = configuration.get(SecurityOptions.KERBEROS_KRB5_PATH); + if (!StringUtils.isNullOrWhitespaceOnly(krb5Config)) { + final File krb5 = new File(krb5Config); + LOG.info("Adding KRB5 configuration {} to the AM container local resource bucket", krb5.getAbsolutePath()); + final Path krb5ConfPath = new Path(krb5.getAbsolutePath()); + remoteKrb5Path = fileUploader + .registerSingleLocalResource( + Utils.KRB5_FILE_NAME, krb5ConfPath, "", LocalResourceType.FILE, false, false) + .getPath(); + hasKrb5 = true; + } + + Path remotePathKeytab = null; + String localizedKeytabPath = null; + String keytab = configuration.getString(SecurityOptions.KERBEROS_LOGIN_KEYTAB); + if (keytab != null) { + boolean localizeKeytab = flinkConfiguration.getBoolean(YarnConfigOptions.SHIP_LOCAL_KEYTAB); + localizedKeytabPath = flinkConfiguration.getString(YarnConfigOptions.LOCALIZED_KEYTAB_PATH); + if (localizeKeytab) { + // Localize the keytab to YARN containers via local resource. + LOG.info("Adding keytab {} to the AM container local resource bucket", keytab); + remotePathKeytab = fileUploader + .registerSingleLocalResource( + localizedKeytabPath, new Path(keytab), "", LocalResourceType.FILE, false, false) + .getPath(); + } else { + // // Assume Keytab is pre-installed in the container. 
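+ // Keytab handling supports two modes, selected by YarnConfigOptions.SHIP_LOCAL_KEYTAB: either the
+ // local keytab is uploaded and localized into the containers (branch above), or it is assumed to
+ // be pre-installed at the same path on every node (this branch). A minimal sketch of the second
+ // mode, with hypothetical values:
+ //
+ //   security.kerberos.login.keytab: /etc/security/keytabs/flink.keytab
+ //   security.kerberos.login.principal: flink/_HOST@EXAMPLE.COM
+ //   yarn.security.kerberos.ship-local-keytab: false
+ //   yarn.security.kerberos.localized-keytab-path: /etc/security/keytabs/flink.keytab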
+ localizedKeytabPath = flinkConfiguration.getString(YarnConfigOptions.LOCALIZED_KEYTAB_PATH); + } + } + + final JobManagerProcessSpec processSpec = + JobManagerProcessUtils.processSpecFromConfigWithNewOptionToInterpretLegacyHeap( + flinkConfiguration, JobManagerOptions.TOTAL_PROCESS_MEMORY); + final ContainerLaunchContext amContainer = + setupApplicationMasterContainer(yarnClusterEntrypoint, hasKrb5, processSpec); + + // setup security tokens + if (UserGroupInformation.isSecurityEnabled()) { + // set HDFS delegation tokens when security is enabled + LOG.info("Adding delegation token to the AM container."); + final List pathsToObtainToken = new ArrayList<>(); + boolean fetchToken = configuration.getBoolean(SecurityOptions.KERBEROS_FETCH_DELEGATION_TOKEN); + if (fetchToken) { + List yarnAccessList = + ConfigUtils.decodeListFromConfig(configuration, YarnConfigOptions.YARN_ACCESS, Path::new); + pathsToObtainToken.addAll(yarnAccessList); + pathsToObtainToken.addAll(fileUploader.getRemotePaths()); + } + Utils.setTokensFor(amContainer, pathsToObtainToken, yarnConfiguration, fetchToken); + } + + amContainer.setLocalResources(fileUploader.getRegisteredLocalResources()); + fileUploader.close(); + + // Setup CLASSPATH and environment variables for ApplicationMaster + final Map appMasterEnv = new HashMap<>(); + // set user specified app master environment variables + appMasterEnv.putAll(ConfigurationUtils.getPrefixedKeyValuePairs( + ResourceManagerOptions.CONTAINERIZED_MASTER_ENV_PREFIX, configuration)); + // set Flink app class path + appMasterEnv.put(YarnConfigKeys.ENV_FLINK_CLASSPATH, classPathBuilder.toString()); + + // Set FLINK_OPT_DIR to `opt` folder under working dir in container + appMasterEnv.put(ENV_FLINK_OPT_DIR, Path.CUR_DIR + "/" + ConfigConstants.DEFAULT_FLINK_OPT_DIR); + + // set Flink on YARN internal configuration values + appMasterEnv.put(YarnConfigKeys.FLINK_DIST_JAR, localResourceDescFlinkJar.toString()); + appMasterEnv.put(YarnConfigKeys.ENV_APP_ID, appId.toString()); + appMasterEnv.put( + YarnConfigKeys.ENV_CLIENT_HOME_DIR, fileUploader.getHomeDir().toString()); + appMasterEnv.put( + YarnConfigKeys.ENV_CLIENT_SHIP_FILES, + encodeYarnLocalResourceDescriptorListToString(fileUploader.getEnvShipResourceList())); + appMasterEnv.put( + YarnConfigKeys.FLINK_YARN_FILES, + fileUploader.getApplicationDir().toUri().toString()); + + // https://github.com/apache/hadoop/blob/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/YarnApplicationSecurity.md#identity-on-an-insecure-cluster-hadoop_user_name + appMasterEnv.put( + YarnConfigKeys.ENV_HADOOP_USER_NAME, + UserGroupInformation.getCurrentUser().getUserName()); + + if (localizedKeytabPath != null) { + appMasterEnv.put(YarnConfigKeys.LOCAL_KEYTAB_PATH, localizedKeytabPath); + String principal = configuration.getString(SecurityOptions.KERBEROS_LOGIN_PRINCIPAL); + appMasterEnv.put(YarnConfigKeys.KEYTAB_PRINCIPAL, principal); + if (remotePathKeytab != null) { + appMasterEnv.put(YarnConfigKeys.REMOTE_KEYTAB_PATH, remotePathKeytab.toString()); + } + } + + // To support Yarn Secure Integration Test Scenario + if (remoteYarnSiteXmlPath != null) { + appMasterEnv.put(YarnConfigKeys.ENV_YARN_SITE_XML_PATH, remoteYarnSiteXmlPath.toString()); + } + if (remoteKrb5Path != null) { + appMasterEnv.put(YarnConfigKeys.ENV_KRB5_PATH, remoteKrb5Path.toString()); + } + + // set classpath from YARN configuration + Utils.setupYarnClassPath(yarnConfiguration, appMasterEnv); + + amContainer.setEnvironment(appMasterEnv); + + // Set up resource 
type requirements for ApplicationMaster + Resource capability = Records.newRecord(Resource.class); + capability.setMemory(clusterSpecification.getMasterMemoryMB()); + capability.setVirtualCores(flinkConfiguration.getInteger(YarnConfigOptions.APP_MASTER_VCORES)); + + final String customApplicationName = customName != null ? customName : applicationName; + + appContext.setApplicationName(customApplicationName); + appContext.setApplicationType(applicationType != null ? applicationType : "Apache Flink"); + appContext.setAMContainerSpec(amContainer); + appContext.setResource(capability); + + // Set priority for application + int priorityNum = flinkConfiguration.getInteger(YarnConfigOptions.APPLICATION_PRIORITY); + if (priorityNum >= 0) { + Priority priority = Priority.newInstance(priorityNum); + appContext.setPriority(priority); + } + + if (yarnQueue != null) { + appContext.setQueue(yarnQueue); + } + + setApplicationNodeLabel(appContext); + + setApplicationTags(appContext); + + // add a hook to clean up in case deployment fails + Thread deploymentFailureHook = new DeploymentFailureHook(yarnApplication, fileUploader.getApplicationDir()); + Runtime.getRuntime().addShutdownHook(deploymentFailureHook); + LOG.info("Submitting application master " + appId); + yarnClient.submitApplication(appContext); + + LOG.info("Waiting for the cluster to be allocated"); + final long startTime = System.currentTimeMillis(); + ApplicationReport report; + YarnApplicationState lastAppState = YarnApplicationState.NEW; + loop: + while (true) { + try { + report = yarnClient.getApplicationReport(appId); + } catch (IOException e) { + throw new YarnDeploymentException("Failed to deploy the cluster.", e); + } + YarnApplicationState appState = report.getYarnApplicationState(); + LOG.debug("Application State: {}", appState); + switch (appState) { + case FAILED: + case KILLED: + throw new YarnDeploymentException("The YARN application unexpectedly switched to state " + + appState + + " during deployment. \n" + + "Diagnostics from YARN: " + + report.getDiagnostics() + + "\n" + + "If log aggregation is enabled on your cluster, use this command to further investigate the issue:\n" + + "yarn logs -applicationId " + + appId); + // break .. + case RUNNING: + LOG.info("YARN application has been deployed successfully."); + break loop; + case FINISHED: + LOG.info("YARN application has been finished successfully."); + break loop; + default: + if (appState != lastAppState) { + LOG.info("Deploying cluster, current state " + appState); + } + if (System.currentTimeMillis() - startTime > 60000) { + LOG.info( + "Deployment took more than 60 seconds. Please check if the requested resources are available in the YARN cluster"); + } + } + lastAppState = appState; + Thread.sleep(250); + } + + // since deployment was successful, remove the hook + ShutdownHookUtil.removeShutdownHook(deploymentFailureHook, getClass().getSimpleName(), LOG); + return report; + } + + private void removeLocalhostBindHostSetting(Configuration configuration, ConfigOption option) { + configuration + .getOptional(option) + .filter(bindHost -> bindHost.equals("localhost")) + .ifPresent(bindHost -> { + LOG.info( + "Removing 'localhost' {} setting from effective configuration; using '0.0.0.0' instead.", + option); + configuration.removeConfig(option); + }); + } + + /** + * Returns the configured remote target home directory if set, otherwise returns the default + * home directory. 
+ * + * @param defaultFileSystem default file system used + * @return the remote target home directory + */ + @VisibleForTesting + Path getStagingDir(FileSystem defaultFileSystem) throws IOException { + final String configuredStagingDir = flinkConfiguration.getString(YarnConfigOptions.STAGING_DIRECTORY); + if (configuredStagingDir == null) { + return defaultFileSystem.getHomeDirectory(); + } + FileSystem stagingDirFs = new Path(configuredStagingDir).getFileSystem(defaultFileSystem.getConf()); + return stagingDirFs.makeQualified(new Path(configuredStagingDir)); + } + + private int getFileReplication() { + final int yarnFileReplication = + yarnConfiguration.getInt(DFSConfigKeys.DFS_REPLICATION_KEY, DFSConfigKeys.DFS_REPLICATION_DEFAULT); + final int fileReplication = flinkConfiguration.getInteger(YarnConfigOptions.FILE_REPLICATION); + return fileReplication > 0 ? fileReplication : yarnFileReplication; + } + + private static String encodeYarnLocalResourceDescriptorListToString(List resources) { + return String.join( + LOCAL_RESOURCE_DESCRIPTOR_SEPARATOR, + resources.stream().map(YarnLocalResourceDescriptor::toString).collect(Collectors.toList())); + } + + /** + * Kills YARN application and stops YARN client. + * + *
Use this method to kill the App before it has been properly deployed + */ + private void failSessionDuringDeployment(YarnClient yarnClient, YarnClientApplication yarnApplication) { + LOG.info("Killing YARN application"); + + try { + yarnClient.killApplication( + yarnApplication.getNewApplicationResponse().getApplicationId()); + } catch (Exception e) { + // we only log a debug message here because the "killApplication" call is a best-effort + // call (we don't know if the application has been deployed when the error occurred). + LOG.debug("Error while killing YARN application", e); + } + } + + private static class ClusterResourceDescription { + public final int totalFreeMemory; + public final int containerLimit; + public final int[] nodeManagersFree; + + public ClusterResourceDescription(int totalFreeMemory, int containerLimit, int[] nodeManagersFree) { + this.totalFreeMemory = totalFreeMemory; + this.containerLimit = containerLimit; + this.nodeManagersFree = nodeManagersFree; + } + } + + private ClusterResourceDescription getCurrentFreeClusterResources(YarnClient yarnClient) + throws YarnException, IOException { + List nodes = yarnClient.getNodeReports(NodeState.RUNNING); + + int totalFreeMemory = 0; + int containerLimit = 0; + int[] nodeManagersFree = new int[nodes.size()]; + + for (int i = 0; i < nodes.size(); i++) { + NodeReport rep = nodes.get(i); + int free = rep.getCapability().getMemory() + - (rep.getUsed() != null ? rep.getUsed().getMemory() : 0); + nodeManagersFree[i] = free; + totalFreeMemory += free; + if (free > containerLimit) { + containerLimit = free; + } + } + return new ClusterResourceDescription(totalFreeMemory, containerLimit, nodeManagersFree); + } + + @Override + public String getClusterDescription() { + + try { + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + PrintStream ps = new PrintStream(baos); + + YarnClusterMetrics metrics = yarnClient.getYarnClusterMetrics(); + + ps.append("NodeManagers in the ClusterClient " + metrics.getNumNodeManagers()); + List nodes = yarnClient.getNodeReports(NodeState.RUNNING); + final String format = "|%-16s |%-16s %n"; + ps.printf("|Property |Value %n"); + ps.println("+---------------------------------------+"); + int totalMemory = 0; + int totalCores = 0; + for (NodeReport rep : nodes) { + final Resource res = rep.getCapability(); + totalMemory += res.getMemory(); + totalCores += res.getVirtualCores(); + ps.format(format, "NodeID", rep.getNodeId()); + ps.format(format, "Memory", res.getMemory() + " MB"); + ps.format(format, "vCores", res.getVirtualCores()); + ps.format(format, "HealthReport", rep.getHealthReport()); + ps.format(format, "Containers", rep.getNumContainers()); + ps.println("+---------------------------------------+"); + } + ps.println("Summary: totalMemory " + totalMemory + " totalCores " + totalCores); + List qInfo = yarnClient.getAllQueues(); + for (QueueInfo q : qInfo) { + ps.println("Queue: " + + q.getQueueName() + + ", Current Capacity: " + + q.getCurrentCapacity() + + " Max Capacity: " + + q.getMaximumCapacity() + + " Applications: " + + q.getApplications().size()); + } + return baos.toString(); + } catch (Exception e) { + throw new RuntimeException("Couldn't get cluster description", e); + } + } + + private void activateHighAvailabilitySupport(ApplicationSubmissionContext appContext) + throws InvocationTargetException, IllegalAccessException { + + ApplicationSubmissionContextReflector reflector = ApplicationSubmissionContextReflector.getInstance(); + + 
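+ // This method is only invoked when high availability is activated in the Flink configuration.
+ // A minimal sketch of the settings involved (all values hypothetical):
+ //
+ //   high-availability: zookeeper
+ //   high-availability.zookeeper.quorum: zk1:2181,zk2:2181,zk3:2181
+ //   high-availability.storageDir: hdfs:///flink/ha
+ //   yarn.application-attempts: 10
+ //   yarn.application-attempt-failures-validity-interval: 600000
+ //
+ // The AM is then retried across application attempts while previously allocated containers are kept.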
reflector.setKeepContainersAcrossApplicationAttempts(appContext, true); + + reflector.setAttemptFailuresValidityInterval( + appContext, + flinkConfiguration.getLong(YarnConfigOptions.APPLICATION_ATTEMPT_FAILURE_VALIDITY_INTERVAL)); + } + + private void setApplicationTags(final ApplicationSubmissionContext appContext) + throws InvocationTargetException, IllegalAccessException { + + final ApplicationSubmissionContextReflector reflector = ApplicationSubmissionContextReflector.getInstance(); + final String tagsString = flinkConfiguration.getString(YarnConfigOptions.APPLICATION_TAGS); + + final Set applicationTags = new HashSet<>(); + + // Trim whitespace and cull empty tags + for (final String tag : tagsString.split(",")) { + final String trimmedTag = tag.trim(); + if (!trimmedTag.isEmpty()) { + applicationTags.add(trimmedTag); + } + } + + reflector.setApplicationTags(appContext, applicationTags); + } + + private void setApplicationNodeLabel(final ApplicationSubmissionContext appContext) + throws InvocationTargetException, IllegalAccessException { + + if (nodeLabel != null) { + final ApplicationSubmissionContextReflector reflector = ApplicationSubmissionContextReflector.getInstance(); + reflector.setApplicationNodeLabel(appContext, nodeLabel); + } + } + + /** + * Singleton object which uses reflection to determine whether the {@link + * ApplicationSubmissionContext} supports various methods which, depending on the Hadoop + * version, may or may not be supported. + * + *
If an unsupported method is invoked, nothing happens. + * + *
Currently three methods are proxied: - setApplicationTags (>= 2.4.0) - + * setAttemptFailuresValidityInterval (>= 2.6.0) - setKeepContainersAcrossApplicationAttempts + * (>= 2.4.0) - setNodeLabelExpression (>= 2.6.0) + */ + private static class ApplicationSubmissionContextReflector { + private static final Logger LOG = LoggerFactory.getLogger(ApplicationSubmissionContextReflector.class); + + private static final ApplicationSubmissionContextReflector instance = + new ApplicationSubmissionContextReflector(ApplicationSubmissionContext.class); + + public static ApplicationSubmissionContextReflector getInstance() { + return instance; + } + + private static final String APPLICATION_TAGS_METHOD_NAME = "setApplicationTags"; + private static final String ATTEMPT_FAILURES_METHOD_NAME = "setAttemptFailuresValidityInterval"; + private static final String KEEP_CONTAINERS_METHOD_NAME = "setKeepContainersAcrossApplicationAttempts"; + private static final String NODE_LABEL_EXPRESSION_NAME = "setNodeLabelExpression"; + + private final Method applicationTagsMethod; + private final Method attemptFailuresValidityIntervalMethod; + private final Method keepContainersMethod; + + @Nullable + private final Method nodeLabelExpressionMethod; + + private ApplicationSubmissionContextReflector(Class clazz) { + Method applicationTagsMethod; + Method attemptFailuresValidityIntervalMethod; + Method keepContainersMethod; + Method nodeLabelExpressionMethod; + + try { + // this method is only supported by Hadoop 2.4.0 onwards + applicationTagsMethod = clazz.getMethod(APPLICATION_TAGS_METHOD_NAME, Set.class); + LOG.debug("{} supports method {}.", clazz.getCanonicalName(), APPLICATION_TAGS_METHOD_NAME); + } catch (NoSuchMethodException e) { + LOG.debug("{} does not support method {}.", clazz.getCanonicalName(), APPLICATION_TAGS_METHOD_NAME); + // assign null because the Hadoop version apparently does not support this call. + applicationTagsMethod = null; + } + + this.applicationTagsMethod = applicationTagsMethod; + + try { + // this method is only supported by Hadoop 2.6.0 onwards + attemptFailuresValidityIntervalMethod = clazz.getMethod(ATTEMPT_FAILURES_METHOD_NAME, long.class); + LOG.debug("{} supports method {}.", clazz.getCanonicalName(), ATTEMPT_FAILURES_METHOD_NAME); + } catch (NoSuchMethodException e) { + LOG.debug("{} does not support method {}.", clazz.getCanonicalName(), ATTEMPT_FAILURES_METHOD_NAME); + // assign null because the Hadoop version apparently does not support this call. + attemptFailuresValidityIntervalMethod = null; + } + + this.attemptFailuresValidityIntervalMethod = attemptFailuresValidityIntervalMethod; + + try { + // this method is only supported by Hadoop 2.4.0 onwards + keepContainersMethod = clazz.getMethod(KEEP_CONTAINERS_METHOD_NAME, boolean.class); + LOG.debug("{} supports method {}.", clazz.getCanonicalName(), KEEP_CONTAINERS_METHOD_NAME); + } catch (NoSuchMethodException e) { + LOG.debug("{} does not support method {}.", clazz.getCanonicalName(), KEEP_CONTAINERS_METHOD_NAME); + // assign null because the Hadoop version apparently does not support this call. 
+ keepContainersMethod = null; + } + + this.keepContainersMethod = keepContainersMethod; + + try { + nodeLabelExpressionMethod = clazz.getMethod(NODE_LABEL_EXPRESSION_NAME, String.class); + LOG.debug("{} supports method {}.", clazz.getCanonicalName(), NODE_LABEL_EXPRESSION_NAME); + } catch (NoSuchMethodException e) { + LOG.debug("{} does not support method {}.", clazz.getCanonicalName(), NODE_LABEL_EXPRESSION_NAME); + nodeLabelExpressionMethod = null; + } + + this.nodeLabelExpressionMethod = nodeLabelExpressionMethod; + } + + public void setApplicationTags(ApplicationSubmissionContext appContext, Set applicationTags) + throws InvocationTargetException, IllegalAccessException { + if (applicationTagsMethod != null) { + LOG.debug( + "Calling method {} of {}.", + applicationTagsMethod.getName(), + appContext.getClass().getCanonicalName()); + applicationTagsMethod.invoke(appContext, applicationTags); + } else { + LOG.debug( + "{} does not support method {}. Doing nothing.", + appContext.getClass().getCanonicalName(), + APPLICATION_TAGS_METHOD_NAME); + } + } + + public void setApplicationNodeLabel(ApplicationSubmissionContext appContext, String nodeLabel) + throws InvocationTargetException, IllegalAccessException { + if (nodeLabelExpressionMethod != null) { + LOG.debug( + "Calling method {} of {}.", + nodeLabelExpressionMethod.getName(), + appContext.getClass().getCanonicalName()); + nodeLabelExpressionMethod.invoke(appContext, nodeLabel); + } else { + LOG.debug( + "{} does not support method {}. Doing nothing.", + appContext.getClass().getCanonicalName(), + NODE_LABEL_EXPRESSION_NAME); + } + } + + public void setAttemptFailuresValidityInterval(ApplicationSubmissionContext appContext, long validityInterval) + throws InvocationTargetException, IllegalAccessException { + if (attemptFailuresValidityIntervalMethod != null) { + LOG.debug( + "Calling method {} of {}.", + attemptFailuresValidityIntervalMethod.getName(), + appContext.getClass().getCanonicalName()); + attemptFailuresValidityIntervalMethod.invoke(appContext, validityInterval); + } else { + LOG.debug( + "{} does not support method {}. Doing nothing.", + appContext.getClass().getCanonicalName(), + ATTEMPT_FAILURES_METHOD_NAME); + } + } + + public void setKeepContainersAcrossApplicationAttempts( + ApplicationSubmissionContext appContext, boolean keepContainers) + throws InvocationTargetException, IllegalAccessException { + + if (keepContainersMethod != null) { + LOG.debug( + "Calling method {} of {}.", + keepContainersMethod.getName(), + appContext.getClass().getCanonicalName()); + keepContainersMethod.invoke(appContext, keepContainers); + } else { + LOG.debug( + "{} does not support method {}. 
Doing nothing.", + appContext.getClass().getCanonicalName(), + KEEP_CONTAINERS_METHOD_NAME); + } + } + } + + private static class YarnDeploymentException extends RuntimeException { + private static final long serialVersionUID = -812040641215388943L; + + public YarnDeploymentException(String message) { + super(message); + } + + public YarnDeploymentException(String message, Throwable cause) { + super(message, cause); + } + } + + private class DeploymentFailureHook extends Thread { + + private final YarnClient yarnClient; + private final YarnClientApplication yarnApplication; + private final Path yarnFilesDir; + + DeploymentFailureHook(YarnClientApplication yarnApplication, Path yarnFilesDir) { + this.yarnApplication = Preconditions.checkNotNull(yarnApplication); + this.yarnFilesDir = Preconditions.checkNotNull(yarnFilesDir); + + // A new yarn client need to be created in shutdown hook in order to avoid + // the yarn client has been closed by YarnClusterDescriptor. + this.yarnClient = YarnClient.createYarnClient(); + this.yarnClient.init(yarnConfiguration); + } + + @Override + public void run() { + LOG.info("Cancelling deployment from Deployment Failure Hook"); + yarnClient.start(); + failSessionDuringDeployment(yarnClient, yarnApplication); + yarnClient.stop(); + LOG.info("Deleting files in {}.", yarnFilesDir); + try { + FileSystem fs = FileSystem.get(yarnConfiguration); + + if (!fs.delete(yarnFilesDir, true)) { + throw new IOException("Deleting files in " + yarnFilesDir + " was unsuccessful"); + } + + fs.close(); + } catch (IOException e) { + LOG.error("Failed to delete Flink Jar and configuration files in HDFS", e); + } + } + } + + @VisibleForTesting + void addLibFoldersToShipFiles(Collection effectiveShipFiles) { + // Add lib folder to the ship files if the environment variable is set. + // This is for convenience when running from the command-line. + // (for other files users explicitly set the ship files) + String libDir = System.getenv().get(ENV_FLINK_LIB_DIR); + if (libDir != null) { + File directoryFile = new File(libDir); + if (directoryFile.isDirectory()) { + effectiveShipFiles.add(directoryFile); + } else { + throw new YarnDeploymentException("The environment variable '" + + ENV_FLINK_LIB_DIR + + "' is set to '" + + libDir + + "' but the directory doesn't exist."); + } + } else if (shipFiles.isEmpty()) { + LOG.warn( + "Environment variable '{}' not set and ship files have not been provided manually. " + + "Not shipping any library files.", + ENV_FLINK_LIB_DIR); + } + } + + @VisibleForTesting + void addUsrLibFolderToShipFiles(Collection effectiveShipFiles) { + // Add usrlib folder to the ship files if it exists + // Classes in the folder will be loaded by UserClassLoader if CLASSPATH_INCLUDE_USER_JAR is + // DISABLED. 
+ ClusterEntrypointUtils.tryFindUserLibDirectory().ifPresent(usrLibDirFile -> { + effectiveShipFiles.add(usrLibDirFile); + LOG.info("usrlib: {} will be shipped automatically.", usrLibDirFile.getAbsolutePath()); + }); + } + + @VisibleForTesting + void addPluginsFoldersToShipFiles(Collection effectiveShipFiles) { + final Optional pluginsDir = PluginConfig.getPluginsDir(); + pluginsDir.ifPresent(effectiveShipFiles::add); + } + + ContainerLaunchContext setupApplicationMasterContainer( + String yarnClusterEntrypoint, boolean hasKrb5, JobManagerProcessSpec processSpec) { + // ------------------ Prepare Application Master Container ------------------------------ + + // respect custom JVM options in the YAML file + String javaOpts = flinkConfiguration.getString(CoreOptions.FLINK_JVM_OPTIONS); + if (flinkConfiguration.getString(CoreOptions.FLINK_JM_JVM_OPTIONS).length() > 0) { + javaOpts += " " + flinkConfiguration.getString(CoreOptions.FLINK_JM_JVM_OPTIONS); + } + + // krb5.conf file will be available as local resource in JM/TM container + if (hasKrb5) { + javaOpts += " -Djava.security.krb5.conf=krb5.conf"; + } + + // Set up the container launch context for the application master + ContainerLaunchContext amContainer = Records.newRecord(ContainerLaunchContext.class); + + final Map startCommandValues = new HashMap<>(); + startCommandValues.put("java", "$JAVA_HOME/bin/java"); + + String jvmHeapMem = JobManagerProcessUtils.generateJvmParametersStr(processSpec, flinkConfiguration); + startCommandValues.put("jvmmem", jvmHeapMem); + + startCommandValues.put("jvmopts", javaOpts); + startCommandValues.put("logging", YarnLogConfigUtil.getLoggingYarnCommand(flinkConfiguration)); + + startCommandValues.put("class", yarnClusterEntrypoint); + startCommandValues.put( + "redirects", + "1> " + + ApplicationConstants.LOG_DIR_EXPANSION_VAR + + "/jobmanager.out " + + "2> " + + ApplicationConstants.LOG_DIR_EXPANSION_VAR + + "/jobmanager.err"); + String dynamicParameterListStr = JobManagerProcessUtils.generateDynamicConfigsStr(processSpec); + startCommandValues.put("args", dynamicParameterListStr); + + final String commandTemplate = flinkConfiguration.getString( + ConfigConstants.YARN_CONTAINER_START_COMMAND_TEMPLATE, + ConfigConstants.DEFAULT_YARN_CONTAINER_START_COMMAND_TEMPLATE); + final String amCommand = BootstrapTools.getStartCommand(commandTemplate, startCommandValues); + + amContainer.setCommands(Collections.singletonList(amCommand)); + + LOG.debug("Application Master start command: " + amCommand); + + return amContainer; + } + + private static YarnConfigOptions.UserJarInclusion getUserJarInclusionMode( + org.apache.flink.configuration.Configuration config) { + return config.get(YarnConfigOptions.CLASSPATH_INCLUDE_USER_JAR); + } + + private static boolean isUsrLibDirIncludedInShipFiles(List shipFiles) { + return shipFiles.stream() + .filter(File::isDirectory) + .map(File::getName) + .anyMatch(name -> name.equals(DEFAULT_FLINK_USR_LIB_DIR)); + } + + private void setClusterEntrypointInfoToConfig(final ApplicationReport report) { + checkNotNull(report); + + final ApplicationId appId = report.getApplicationId(); + final String host = report.getHost(); + final int port = report.getRpcPort(); + + LOG.info("Found Web Interface {}:{} of application '{}'.", host, port, appId); + + flinkConfiguration.setString(JobManagerOptions.ADDRESS, host); + flinkConfiguration.setInteger(JobManagerOptions.PORT, port); + + flinkConfiguration.setString(RestOptions.ADDRESS, host); + flinkConfiguration.setInteger(RestOptions.PORT, port); 
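+ // After deployment (or retrieve()), the client-side configuration carries everything needed to
+ // talk to the new cluster. A sketch of the resulting entries, with hypothetical values:
+ //
+ //   jobmanager.rpc.address: node-42.cluster.local
+ //   rest.address: node-42.cluster.local
+ //   rest.port: 38211
+ //   yarn.application.id: application_1700000000000_0001
+ //   high-availability.cluster-id: application_1700000000000_0001   (only when not set explicitly)
+ //
+ // RestClusterClient is then constructed directly from this configuration in deployInternal/retrieve.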
+ + flinkConfiguration.set(YarnConfigOptions.APPLICATION_ID, ConverterUtils.toString(appId)); + + setHAClusterIdIfNotSet(flinkConfiguration, appId); + } + + private void setHAClusterIdIfNotSet(Configuration configuration, ApplicationId appId) { + // set cluster-id to app id if not specified + if (!configuration.contains(HighAvailabilityOptions.HA_CLUSTER_ID)) { + configuration.set(HighAvailabilityOptions.HA_CLUSTER_ID, ConverterUtils.toString(appId)); + } + } + + public static void logDetachedClusterInformation(ApplicationId yarnApplicationId, Logger logger) { + logger.info( + "The Flink YARN session cluster has been started in detached mode. In order to " + + "stop Flink gracefully, use the following command:\n" + + "$ echo \"stop\" | ./bin/yarn-session.sh -id {}\n" + + "If this should not be possible, then you can also kill Flink via YARN's web interface or via:\n" + + "$ yarn application -kill {}\n" + + "Note that killing Flink might not clean up all job artifacts and temporary files.", + yarnApplicationId, + yarnApplicationId); + } +} diff --git a/dinky-client/dinky-client-1.15/src/main/java/org/dinky/executor/CustomTableEnvironmentImpl.java b/dinky-client/dinky-client-1.15/src/main/java/org/dinky/executor/CustomTableEnvironmentImpl.java index 037b8d1aab..20df2c61a2 100644 --- a/dinky-client/dinky-client-1.15/src/main/java/org/dinky/executor/CustomTableEnvironmentImpl.java +++ b/dinky-client/dinky-client-1.15/src/main/java/org/dinky/executor/CustomTableEnvironmentImpl.java @@ -66,6 +66,7 @@ import org.apache.flink.types.Row; import java.io.File; +import java.net.URL; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; @@ -78,6 +79,7 @@ import com.fasterxml.jackson.databind.node.ObjectNode; import cn.hutool.core.collection.CollUtil; +import cn.hutool.core.util.URLUtil; /** * CustomTableEnvironmentImpl @@ -197,14 +199,14 @@ public ObjectNode getStreamGraph(String statement) { @Override public void addJar(File... jarPath) { - Configuration configuration = this.getRootConfiguration(); + Configuration configuration = new Configuration(this.getRootConfiguration()); + List pathList = + Arrays.stream(URLUtil.getURLs(jarPath)).map(URL::toString).collect(Collectors.toList()); List jars = configuration.get(PipelineOptions.JARS); if (jars == null) { - configuration.set( - PipelineOptions.JARS, - Arrays.stream(jarPath).map(File::getAbsolutePath).collect(Collectors.toList())); + configuration.set(PipelineOptions.JARS, pathList); } else { - CollUtil.addAll(jars, jarPath); + CollUtil.addAll(jars, pathList); } } diff --git a/dinky-client/dinky-client-1.16/src/main/java/org/apache/flink/yarn/Utils.java b/dinky-client/dinky-client-1.16/src/main/java/org/apache/flink/yarn/Utils.java new file mode 100644 index 0000000000..e218ba01d0 --- /dev/null +++ b/dinky-client/dinky-client-1.16/src/main/java/org/apache/flink/yarn/Utils.java @@ -0,0 +1,692 @@ +/* + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package org.apache.flink.yarn; + +import static org.apache.flink.util.Preconditions.checkArgument; +import static org.apache.flink.util.Preconditions.checkNotNull; +import static org.apache.flink.yarn.YarnConfigKeys.ENV_FLINK_CLASSPATH; +import static org.apache.flink.yarn.YarnConfigKeys.LOCAL_RESOURCE_DESCRIPTOR_SEPARATOR; + +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.flink.configuration.ConfigConstants; +import org.apache.flink.configuration.ConfigUtils; +import org.apache.flink.runtime.clusterframework.BootstrapTools; +import org.apache.flink.runtime.clusterframework.ContaineredTaskManagerParameters; +import org.apache.flink.runtime.security.token.DelegationTokenConverter; +import org.apache.flink.runtime.util.HadoopUtils; +import org.apache.flink.util.StringUtils; +import org.apache.flink.util.function.FunctionWithException; +import org.apache.flink.yarn.configuration.YarnConfigOptions; +import org.apache.flink.yarn.configuration.YarnResourceManagerDriverConfiguration; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.DataOutputBuffer; +import org.apache.hadoop.mapreduce.security.TokenCache; +import org.apache.hadoop.security.Credentials; +import org.apache.hadoop.security.UserGroupInformation; +import org.apache.hadoop.security.token.Token; +import org.apache.hadoop.security.token.TokenIdentifier; +import org.apache.hadoop.util.StringInterner; +import org.apache.hadoop.yarn.api.ApplicationConstants; +import org.apache.hadoop.yarn.api.ApplicationConstants.Environment; +import org.apache.hadoop.yarn.api.records.ContainerLaunchContext; +import org.apache.hadoop.yarn.api.records.LocalResource; +import org.apache.hadoop.yarn.api.records.LocalResourceType; +import org.apache.hadoop.yarn.api.records.LocalResourceVisibility; +import org.apache.hadoop.yarn.api.records.Resource; +import org.apache.hadoop.yarn.conf.YarnConfiguration; +import org.apache.hadoop.yarn.security.AMRMTokenIdentifier; +import org.apache.hadoop.yarn.util.ConverterUtils; +import org.apache.hadoop.yarn.util.Records; + +import java.io.Closeable; +import java.io.File; +import java.io.IOException; +import java.lang.reflect.InvocationTargetException; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Optional; +import java.util.stream.Stream; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import cn.hutool.core.util.StrUtil; + +/** + * Utility class that provides helper methods to work with Apache Hadoop YARN. + */ +public final class Utils { + + private static final Logger LOG = LoggerFactory.getLogger(Utils.class); + + /** + * KRB5 file name populated in YARN container for secure IT run. + */ + public static final String KRB5_FILE_NAME = "krb5.conf"; + + /** + * Yarn site xml file name populated in YARN container for secure IT run. 
+ */ + public static final String YARN_SITE_FILE_NAME = "yarn-site.xml"; + + /** + * The prefixes that Flink adds to the YARN config. + */ + private static final String[] FLINK_CONFIG_PREFIXES = {"flink.yarn."}; + + @VisibleForTesting + static final String YARN_RM_FAIR_SCHEDULER_CLAZZ = + "org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler"; + + @VisibleForTesting + static final String YARN_RM_SLS_FAIR_SCHEDULER_CLAZZ = "org.apache.hadoop.yarn.sls.scheduler.SLSFairScheduler"; + + @VisibleForTesting + static final String YARN_RM_INCREMENT_ALLOCATION_MB_KEY = "yarn.resource-types.memory-mb.increment-allocation"; + + @VisibleForTesting + static final String YARN_RM_INCREMENT_ALLOCATION_MB_LEGACY_KEY = "yarn.scheduler.increment-allocation-mb"; + + private static final int DEFAULT_YARN_RM_INCREMENT_ALLOCATION_MB = 1024; + + @VisibleForTesting + static final String YARN_RM_INCREMENT_ALLOCATION_VCORES_KEY = "yarn.resource-types.vcores.increment-allocation"; + + @VisibleForTesting + static final String YARN_RM_INCREMENT_ALLOCATION_VCORES_LEGACY_KEY = "yarn.scheduler.increment-allocation-vcores"; + + private static final int DEFAULT_YARN_RM_INCREMENT_ALLOCATION_VCORES = 1; + + public static void setupYarnClassPath(Configuration conf, Map appMasterEnv) { + addToEnvironment(appMasterEnv, Environment.CLASSPATH.name(), appMasterEnv.get(ENV_FLINK_CLASSPATH)); + String[] applicationClassPathEntries = conf.getStrings( + YarnConfiguration.YARN_APPLICATION_CLASSPATH, + Stream.of(YarnConfiguration.DEFAULT_YARN_APPLICATION_CLASSPATH) + .map(x -> StrUtil.removeAll(x, "%")) + .map(x -> "$".equals(StrUtil.subPre(x, 1)) ? x : "$" + x) + .toArray(String[]::new)); + for (String c : applicationClassPathEntries) { + addToEnvironment(appMasterEnv, Environment.CLASSPATH.name(), c.trim()); + } + } + + /** + * Deletes the YARN application files, e.g., Flink binaries, libraries, etc., from the remote + * filesystem. + * + * @param applicationFilesDir The application files directory. + */ + public static void deleteApplicationFiles(final String applicationFilesDir) { + if (!StringUtils.isNullOrWhitespaceOnly(applicationFilesDir)) { + final org.apache.flink.core.fs.Path path = new org.apache.flink.core.fs.Path(applicationFilesDir); + try { + final org.apache.flink.core.fs.FileSystem fileSystem = path.getFileSystem(); + if (!fileSystem.delete(path, true)) { + LOG.error("Deleting yarn application files under {} was unsuccessful.", applicationFilesDir); + } + } catch (final IOException e) { + LOG.error("Could not properly delete yarn application files directory {}.", applicationFilesDir, e); + } + } else { + LOG.debug("No yarn application files directory set. Therefore, cannot clean up the data."); + } + } + + /** + * Creates a YARN resource for the remote object at the given location. 
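The rewrite of DEFAULT_YARN_APPLICATION_CLASSPATH in setupYarnClassPath above is central to letting a Windows client submit to a Linux YARN cluster: Hadoop renders those default entries through Environment.$(), which yields %HADOOP_CONF_DIR%-style values when the client JVM runs on Windows, and a Linux container never expands that form. Stripping the percent signs and prefixing "$" normalizes every entry to the Unix form before it is written into the container environment. A small sketch of the same normalization without the hutool helpers; the input entries are illustrative:

    import java.util.Arrays;

    public class ClasspathNormalizeSketch {

        // Normalizes a Windows-style %VAR% classpath entry to the Unix $VAR form;
        // entries that already start with '$' are left unchanged.
        static String toUnixForm(String entry) {
            String stripped = entry.replace("%", "");
            return stripped.startsWith("$") ? stripped : "$" + stripped;
        }

        public static void main(String[] args) {
            String[] defaults = {
                "%HADOOP_CONF_DIR%",                          // rendering seen on a Windows client
                "%HADOOP_COMMON_HOME%/share/hadoop/common/*",
                "$HADOOP_YARN_HOME/share/hadoop/yarn/*"       // already in Unix form
            };
            Arrays.stream(defaults)
                    .map(ClasspathNormalizeSketch::toUnixForm)
                    .forEach(System.out::println);
            // Prints:
            // $HADOOP_CONF_DIR
            // $HADOOP_COMMON_HOME/share/hadoop/common/*
            // $HADOOP_YARN_HOME/share/hadoop/yarn/*
        }
    }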
+ * + * @param remoteRsrcPath remote location of the resource + * @param resourceSize size of the resource + * @param resourceModificationTime last modification time of the resource + * @return YARN resource + */ + static LocalResource registerLocalResource( + Path remoteRsrcPath, + long resourceSize, + long resourceModificationTime, + LocalResourceVisibility resourceVisibility, + LocalResourceType resourceType) { + LocalResource localResource = Records.newRecord(LocalResource.class); + localResource.setResource(ConverterUtils.getYarnUrlFromURI(remoteRsrcPath.toUri())); + localResource.setSize(resourceSize); + localResource.setTimestamp(resourceModificationTime); + localResource.setType(resourceType); + localResource.setVisibility(resourceVisibility); + return localResource; + } + + /** + * Creates a YARN resource for the remote object at the given location. + * + * @param fs remote filesystem + * @param remoteRsrcPath resource path to be registered + * @return YARN resource + */ + private static LocalResource registerLocalResource( + FileSystem fs, Path remoteRsrcPath, LocalResourceType resourceType) throws IOException { + FileStatus jarStat = fs.getFileStatus(remoteRsrcPath); + return registerLocalResource( + remoteRsrcPath, + jarStat.getLen(), + jarStat.getModificationTime(), + LocalResourceVisibility.APPLICATION, + resourceType); + } + + public static void setTokensFor( + ContainerLaunchContext amContainer, List paths, Configuration conf, boolean obtainingDelegationTokens) + throws IOException { + Credentials credentials = new Credentials(); + + if (obtainingDelegationTokens) { + LOG.info("Obtaining delegation tokens for HDFS and HBase."); + // for HDFS + TokenCache.obtainTokensForNamenodes(credentials, paths.toArray(new Path[0]), conf); + // for HBase + obtainTokenForHBase(credentials, conf); + } else { + LOG.info("Delegation token retrieval for HDFS and HBase is disabled."); + } + + // for user + UserGroupInformation currUsr = UserGroupInformation.getCurrentUser(); + for (Token token : currUsr.getCredentials().getAllTokens()) { + LOG.info("Adding user token " + token.getService() + " with " + token); + credentials.addToken(token.getService(), token); + } + + ByteBuffer tokens = ByteBuffer.wrap(DelegationTokenConverter.serialize(credentials)); + amContainer.setTokens(tokens); + } + + /** + * Obtain Kerberos security token for HBase. 
+ */ + private static void obtainTokenForHBase(Credentials credentials, Configuration conf) throws IOException { + if (UserGroupInformation.isSecurityEnabled()) { + LOG.info("Attempting to obtain Kerberos security token for HBase"); + try { + // ---- + // Intended call: HBaseConfiguration.addHbaseResources(conf); + Class.forName("org.apache.hadoop.hbase.HBaseConfiguration") + .getMethod("addHbaseResources", Configuration.class) + .invoke(null, conf); + // ---- + + LOG.info("HBase security setting: {}", conf.get("hbase.security.authentication")); + + if (!"kerberos".equals(conf.get("hbase.security.authentication"))) { + LOG.info("HBase has not been configured to use Kerberos."); + return; + } + + Token token; + try { + LOG.info("Obtaining Kerberos security token for HBase"); + // ---- + // Intended call: Token token = + // TokenUtil.obtainToken(conf); + token = (Token) Class.forName("org.apache.hadoop.hbase.security.token.TokenUtil") + .getMethod("obtainToken", Configuration.class) + .invoke(null, conf); + // ---- + } catch (NoSuchMethodException e) { + // for HBase 2 + + // ---- + // Intended call: ConnectionFactory connectionFactory = + // ConnectionFactory.createConnection(conf); + Closeable connectionFactory = + (Closeable) Class.forName("org.apache.hadoop.hbase.client.ConnectionFactory") + .getMethod("createConnection", Configuration.class) + .invoke(null, conf); + // ---- + Class connectionClass = Class.forName("org.apache.hadoop.hbase.client.Connection"); + // ---- + // Intended call: Token token = + // TokenUtil.obtainToken(connectionFactory); + token = (Token) Class.forName("org.apache.hadoop.hbase.security.token.TokenUtil") + .getMethod("obtainToken", connectionClass) + .invoke(null, connectionFactory); + // ---- + if (null != connectionFactory) { + connectionFactory.close(); + } + } + + if (token == null) { + LOG.error("No Kerberos security token for HBase available"); + return; + } + + credentials.addToken(token.getService(), token); + LOG.info("Added HBase Kerberos security token to credentials."); + } catch (ClassNotFoundException + | NoSuchMethodException + | IllegalAccessException + | InvocationTargetException e) { + LOG.info( + "HBase is not available (not packaged with this application): {} : \"{}\".", + e.getClass().getSimpleName(), + e.getMessage()); + } + } + } + + /** + * Copied method from org.apache.hadoop.yarn.util.Apps. It was broken by YARN-1824 (2.4.0) and + * fixed for 2.4.1 by https://issues.apache.org/jira/browse/YARN-1931 + */ + public static void addToEnvironment(Map environment, String variable, String value) { + String val = environment.get(variable); + if (val == null) { + val = value; + } else { + val = val + YarnClusterDescriptor.pathSeparator + value; + } + environment.put(StringInterner.weakIntern(variable), StringInterner.weakIntern(val)); + } + + /** + * Resolve keytab path either as absolute path or relative to working directory. + * + * @param workingDir current working directory + * @param keytabPath configured keytab path. + * @return resolved keytab path, or null if not found. 
+ */ + public static String resolveKeytabPath(String workingDir, String keytabPath) { + String keytab = null; + if (keytabPath != null) { + File f; + f = new File(keytabPath); + if (f.exists()) { + keytab = f.getAbsolutePath(); + LOG.info("Resolved keytab path: {}", keytab); + } else { + // try using relative paths, this is the case when the keytab was shipped + // as a local resource + f = new File(workingDir, keytabPath); + if (f.exists()) { + keytab = f.getAbsolutePath(); + LOG.info("Resolved keytab path: {}", keytab); + } else { + LOG.warn("Could not resolve keytab path with: {}", keytabPath); + keytab = null; + } + } + } + return keytab; + } + + /** + * Private constructor to prevent instantiation. + */ + private Utils() { + throw new RuntimeException(); + } + + /** + * Creates the launch context, which describes how to bring up a TaskExecutor / TaskManager + * process in an allocated YARN container. + * + *
<p>
This code is extremely YARN specific and registers all the resources that the TaskExecutor + * needs (such as JAR file, config file, ...) and all environment variables in a YARN container + * launch context. The launch context then ensures that those resources will be copied into the + * containers transient working directory. + * + * @param flinkConfig The Flink configuration object. + * @param yarnConfig The YARN configuration object. + * @param configuration The YarnResourceManagerDriver configurations. + * @param tmParams The TaskExecutor container memory parameters. + * @param taskManagerDynamicProperties The dynamic configurations to be updated for the + * TaskExecutors based on client uploaded Flink config. + * @param workingDirectory The current application master container's working directory. + * @param taskManagerMainClass The class with the main method. + * @param log The logger. + * @return The launch context for the TaskManager processes. + * @throws Exception Thrown if the launch context could not be created, for example if the + * resources could not be copied. + */ + static ContainerLaunchContext createTaskExecutorContext( + org.apache.flink.configuration.Configuration flinkConfig, + YarnConfiguration yarnConfig, + YarnResourceManagerDriverConfiguration configuration, + ContaineredTaskManagerParameters tmParams, + String taskManagerDynamicProperties, + String workingDirectory, + Class taskManagerMainClass, + Logger log) + throws Exception { + + // get and validate all relevant variables + + String remoteFlinkJarPath = checkNotNull( + configuration.getFlinkDistJar(), "Environment variable %s not set", YarnConfigKeys.FLINK_DIST_JAR); + + String shipListString = checkNotNull( + configuration.getClientShipFiles(), + "Environment variable %s not set", + YarnConfigKeys.ENV_CLIENT_SHIP_FILES); + + final String remoteKeytabPath = configuration.getRemoteKeytabPath(); + final String localKeytabPath = configuration.getLocalKeytabPath(); + final String keytabPrincipal = configuration.getKeytabPrinciple(); + final String remoteYarnConfPath = configuration.getYarnSiteXMLPath(); + final String remoteKrb5Path = configuration.getKrb5Path(); + + if (log.isDebugEnabled()) { + log.debug("TM:remote keytab path obtained {}", remoteKeytabPath); + log.debug("TM:local keytab path obtained {}", localKeytabPath); + log.debug("TM:keytab principal obtained {}", keytabPrincipal); + log.debug("TM:remote yarn conf path obtained {}", remoteYarnConfPath); + log.debug("TM:remote krb5 path obtained {}", remoteKrb5Path); + } + + String classPathString = checkNotNull( + configuration.getFlinkClasspath(), + "Environment variable %s not set", + YarnConfigKeys.ENV_FLINK_CLASSPATH); + + // register keytab + LocalResource keytabResource = null; + if (remoteKeytabPath != null) { + log.info("TM:Adding keytab {} to the container local resource bucket", remoteKeytabPath); + Path keytabPath = new Path(remoteKeytabPath); + FileSystem fs = keytabPath.getFileSystem(yarnConfig); + keytabResource = registerLocalResource(fs, keytabPath, LocalResourceType.FILE); + } + + // To support Yarn Secure Integration Test Scenario + LocalResource yarnConfResource = null; + if (remoteYarnConfPath != null) { + log.info("TM:Adding remoteYarnConfPath {} to the container local resource bucket", remoteYarnConfPath); + Path yarnConfPath = new Path(remoteYarnConfPath); + FileSystem fs = yarnConfPath.getFileSystem(yarnConfig); + yarnConfResource = registerLocalResource(fs, yarnConfPath, LocalResourceType.FILE); + } + + // register krb5.conf + 
LocalResource krb5ConfResource = null; + boolean hasKrb5 = false; + if (remoteKrb5Path != null) { + log.info("Adding remoteKrb5Path {} to the container local resource bucket", remoteKrb5Path); + Path krb5ConfPath = new Path(remoteKrb5Path); + FileSystem fs = krb5ConfPath.getFileSystem(yarnConfig); + krb5ConfResource = registerLocalResource(fs, krb5ConfPath, LocalResourceType.FILE); + hasKrb5 = true; + } + + Map taskManagerLocalResources = new HashMap<>(); + + // register Flink Jar with remote HDFS + final YarnLocalResourceDescriptor flinkDistLocalResourceDesc = + YarnLocalResourceDescriptor.fromString(remoteFlinkJarPath); + taskManagerLocalResources.put( + flinkDistLocalResourceDesc.getResourceKey(), flinkDistLocalResourceDesc.toLocalResource()); + + // To support Yarn Secure Integration Test Scenario + if (yarnConfResource != null) { + taskManagerLocalResources.put(YARN_SITE_FILE_NAME, yarnConfResource); + } + if (krb5ConfResource != null) { + taskManagerLocalResources.put(KRB5_FILE_NAME, krb5ConfResource); + } + if (keytabResource != null) { + taskManagerLocalResources.put(localKeytabPath, keytabResource); + } + + // prepare additional files to be shipped + decodeYarnLocalResourceDescriptorListFromString(shipListString) + .forEach(resourceDesc -> + taskManagerLocalResources.put(resourceDesc.getResourceKey(), resourceDesc.toLocalResource())); + + // now that all resources are prepared, we can create the launch context + + log.info("Creating container launch context for TaskManagers"); + + boolean hasLogback = new File(workingDirectory, "logback.xml").exists(); + boolean hasLog4j = new File(workingDirectory, "log4j.properties").exists(); + + String launchCommand = BootstrapTools.getTaskManagerShellCommand( + flinkConfig, + tmParams, + ".", + ApplicationConstants.LOG_DIR_EXPANSION_VAR, + hasLogback, + hasLog4j, + hasKrb5, + taskManagerMainClass, + taskManagerDynamicProperties); + + if (log.isDebugEnabled()) { + log.debug("Starting TaskManagers with command: " + launchCommand); + } else { + log.info("Starting TaskManagers"); + } + + ContainerLaunchContext ctx = Records.newRecord(ContainerLaunchContext.class); + ctx.setCommands(Collections.singletonList(launchCommand)); + ctx.setLocalResources(taskManagerLocalResources); + + Map containerEnv = new HashMap<>(); + containerEnv.putAll(tmParams.taskManagerEnv()); + + // add YARN classpath, etc to the container environment + containerEnv.put(ENV_FLINK_CLASSPATH, classPathString); + setupYarnClassPath(yarnConfig, containerEnv); + + containerEnv.put( + YarnConfigKeys.ENV_HADOOP_USER_NAME, + UserGroupInformation.getCurrentUser().getUserName()); + + if (remoteKeytabPath != null && localKeytabPath != null && keytabPrincipal != null) { + containerEnv.put(YarnConfigKeys.REMOTE_KEYTAB_PATH, remoteKeytabPath); + containerEnv.put(YarnConfigKeys.LOCAL_KEYTAB_PATH, localKeytabPath); + containerEnv.put(YarnConfigKeys.KEYTAB_PRINCIPAL, keytabPrincipal); + } else if (localKeytabPath != null && keytabPrincipal != null) { + containerEnv.put(YarnConfigKeys.LOCAL_KEYTAB_PATH, localKeytabPath); + containerEnv.put(YarnConfigKeys.KEYTAB_PRINCIPAL, keytabPrincipal); + } + + ctx.setEnvironment(containerEnv); + + // For TaskManager YARN container context, read the tokens from the jobmanager yarn + // container local file. + // NOTE: must read the tokens from the local file, not from the UGI context, because if UGI + // is login + // using Kerberos keytabs, there is no HDFS delegation token in the UGI context. 
+ final String fileLocation = System.getenv(UserGroupInformation.HADOOP_TOKEN_FILE_LOCATION); + + if (fileLocation != null) { + log.debug("Adding security tokens to TaskExecutor's container launch context."); + + try (DataOutputBuffer dob = new DataOutputBuffer()) { + Credentials cred = Credentials.readTokenStorageFile( + new File(fileLocation), HadoopUtils.getHadoopConfiguration(flinkConfig)); + + // Filter out AMRMToken before setting the tokens to the TaskManager container + // context. + Credentials taskManagerCred = new Credentials(); + Collection> userTokens = cred.getAllTokens(); + for (Token token : userTokens) { + if (!token.getKind().equals(AMRMTokenIdentifier.KIND_NAME)) { + taskManagerCred.addToken(token.getService(), token); + } + } + + taskManagerCred.writeTokenStorageToStream(dob); + ByteBuffer securityTokens = ByteBuffer.wrap(dob.getData(), 0, dob.getLength()); + ctx.setTokens(securityTokens); + } catch (Throwable t) { + log.error("Failed to add Hadoop's security tokens.", t); + } + } else { + log.info("Could not set security tokens because Hadoop's token file location is unknown."); + } + + return ctx; + } + + static boolean isRemotePath(String path) throws IOException { + org.apache.flink.core.fs.Path flinkPath = new org.apache.flink.core.fs.Path(path); + return flinkPath.getFileSystem().isDistributedFS(); + } + + private static List decodeYarnLocalResourceDescriptorListFromString(String resources) + throws Exception { + final List resourceDescriptors = new ArrayList<>(); + for (String shipResourceDescStr : resources.split(LOCAL_RESOURCE_DESCRIPTOR_SEPARATOR)) { + if (!shipResourceDescStr.isEmpty()) { + resourceDescriptors.add(YarnLocalResourceDescriptor.fromString(shipResourceDescStr)); + } + } + return resourceDescriptors; + } + + @VisibleForTesting + static Resource getUnitResource(YarnConfiguration yarnConfig) { + final int unitMemMB, unitVcore; + + final String yarnRmSchedulerClazzName = yarnConfig.get(YarnConfiguration.RM_SCHEDULER); + if (Objects.equals(yarnRmSchedulerClazzName, YARN_RM_FAIR_SCHEDULER_CLAZZ) + || Objects.equals(yarnRmSchedulerClazzName, YARN_RM_SLS_FAIR_SCHEDULER_CLAZZ)) { + String propMem = yarnConfig.get(YARN_RM_INCREMENT_ALLOCATION_MB_KEY); + String propVcore = yarnConfig.get(YARN_RM_INCREMENT_ALLOCATION_VCORES_KEY); + + unitMemMB = propMem != null + ? Integer.parseInt(propMem) + : yarnConfig.getInt( + YARN_RM_INCREMENT_ALLOCATION_MB_LEGACY_KEY, DEFAULT_YARN_RM_INCREMENT_ALLOCATION_MB); + unitVcore = propVcore != null + ? 
Integer.parseInt(propVcore) + : yarnConfig.getInt( + YARN_RM_INCREMENT_ALLOCATION_VCORES_LEGACY_KEY, + DEFAULT_YARN_RM_INCREMENT_ALLOCATION_VCORES); + } else { + unitMemMB = yarnConfig.getInt( + YarnConfiguration.RM_SCHEDULER_MINIMUM_ALLOCATION_MB, + YarnConfiguration.DEFAULT_RM_SCHEDULER_MINIMUM_ALLOCATION_MB); + unitVcore = yarnConfig.getInt( + YarnConfiguration.RM_SCHEDULER_MINIMUM_ALLOCATION_VCORES, + YarnConfiguration.DEFAULT_RM_SCHEDULER_MINIMUM_ALLOCATION_VCORES); + } + + return Resource.newInstance(unitMemMB, unitVcore); + } + + public static List getQualifiedRemoteProvidedLibDirs( + org.apache.flink.configuration.Configuration configuration, YarnConfiguration yarnConfiguration) + throws IOException { + + return getRemoteSharedLibPaths(configuration, pathStr -> { + final Path path = new Path(pathStr); + return path.getFileSystem(yarnConfiguration).makeQualified(path); + }); + } + + private static List getRemoteSharedLibPaths( + org.apache.flink.configuration.Configuration configuration, + FunctionWithException strToPathMapper) + throws IOException { + + final List providedLibDirs = + ConfigUtils.decodeListFromConfig(configuration, YarnConfigOptions.PROVIDED_LIB_DIRS, strToPathMapper); + + for (Path path : providedLibDirs) { + if (!Utils.isRemotePath(path.toString())) { + throw new IllegalArgumentException("The \"" + + YarnConfigOptions.PROVIDED_LIB_DIRS.key() + + "\" should only contain" + + " dirs accessible from all worker nodes, while the \"" + + path + + "\" is local."); + } + } + return providedLibDirs; + } + + public static boolean isUsrLibDirectory(final FileSystem fileSystem, final Path path) throws IOException { + final FileStatus fileStatus = fileSystem.getFileStatus(path); + // Use the Path obj from fileStatus to get rid of trailing slash + return fileStatus.isDirectory() + && ConfigConstants.DEFAULT_FLINK_USR_LIB_DIR.equals( + fileStatus.getPath().getName()); + } + + public static Optional getQualifiedRemoteProvidedUsrLib( + org.apache.flink.configuration.Configuration configuration, YarnConfiguration yarnConfiguration) + throws IOException, IllegalArgumentException { + String usrlib = configuration.getString(YarnConfigOptions.PROVIDED_USRLIB_DIR); + if (usrlib == null) { + return Optional.empty(); + } + final Path qualifiedUsrLibPath = FileSystem.get(yarnConfiguration).makeQualified(new Path(usrlib)); + checkArgument( + isRemotePath(qualifiedUsrLibPath.toString()), + "The \"%s\" must point to a remote dir " + "which is accessible from all worker nodes.", + YarnConfigOptions.PROVIDED_USRLIB_DIR.key()); + checkArgument( + isUsrLibDirectory(FileSystem.get(yarnConfiguration), qualifiedUsrLibPath), + "The \"%s\" should be named with \"%s\".", + YarnConfigOptions.PROVIDED_USRLIB_DIR.key(), + ConfigConstants.DEFAULT_FLINK_USR_LIB_DIR); + return Optional.of(qualifiedUsrLibPath); + } + + public static YarnConfiguration getYarnAndHadoopConfiguration( + org.apache.flink.configuration.Configuration flinkConfig) { + final YarnConfiguration yarnConfig = getYarnConfiguration(flinkConfig); + yarnConfig.addResource(HadoopUtils.getHadoopConfiguration(flinkConfig)); + + return yarnConfig; + } + + /** + * Add additional config entries from the flink config to the yarn config. + * + * @param flinkConfig The Flink configuration object. + * @return The yarn configuration. 
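As a usage sketch for getQualifiedRemoteProvidedLibDirs above: the directories listed under yarn.provided.lib.dirs must be reachable from all worker nodes, and getRemoteSharedLibPaths rejects local paths. The snippet assumes an HDFS cluster configured as the default filesystem; the path is an example only:

    import org.apache.flink.configuration.Configuration;
    import org.apache.flink.yarn.Utils;
    import org.apache.flink.yarn.configuration.YarnConfigOptions;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.yarn.conf.YarnConfiguration;

    import java.util.Arrays;
    import java.util.List;

    public class ProvidedLibDirsSketch {
        public static void main(String[] args) throws Exception {
            Configuration flinkConfig = new Configuration();
            // Pre-uploaded Flink libraries; these are later registered as local resources with
            // public visibility so containers on the same node can share the localized files.
            flinkConfig.set(YarnConfigOptions.PROVIDED_LIB_DIRS, Arrays.asList("hdfs:///flink/1.16/lib"));

            List<Path> qualified = Utils.getQualifiedRemoteProvidedLibDirs(flinkConfig, new YarnConfiguration());
            qualified.forEach(System.out::println); // e.g. hdfs://namenode:8020/flink/1.16/lib
        }
    }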
+ */ + public static YarnConfiguration getYarnConfiguration(org.apache.flink.configuration.Configuration flinkConfig) { + final YarnConfiguration yarnConfig = new YarnConfiguration(); + + for (String key : flinkConfig.keySet()) { + for (String prefix : FLINK_CONFIG_PREFIXES) { + if (key.startsWith(prefix)) { + String newKey = key.substring("flink.".length()); + String value = flinkConfig.getString(key, null); + yarnConfig.set(newKey, value); + LOG.debug("Adding Flink config entry for {} as {}={} to Yarn config", key, newKey, value); + } + } + } + + return yarnConfig; + } +} diff --git a/dinky-client/dinky-client-1.16/src/main/java/org/apache/flink/yarn/YarnClusterDescriptor.java b/dinky-client/dinky-client-1.16/src/main/java/org/apache/flink/yarn/YarnClusterDescriptor.java new file mode 100644 index 0000000000..d4ae67ad07 --- /dev/null +++ b/dinky-client/dinky-client-1.16/src/main/java/org/apache/flink/yarn/YarnClusterDescriptor.java @@ -0,0 +1,1737 @@ +/* + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package org.apache.flink.yarn; + +import static org.apache.flink.client.deployment.application.ApplicationConfiguration.APPLICATION_MAIN_CLASS; +import static org.apache.flink.configuration.ConfigConstants.DEFAULT_FLINK_USR_LIB_DIR; +import static org.apache.flink.configuration.ConfigConstants.ENV_FLINK_LIB_DIR; +import static org.apache.flink.configuration.ConfigConstants.ENV_FLINK_OPT_DIR; +import static org.apache.flink.runtime.entrypoint.component.FileJobGraphRetriever.JOB_GRAPH_FILE_PATH; +import static org.apache.flink.util.Preconditions.checkArgument; +import static org.apache.flink.util.Preconditions.checkNotNull; +import static org.apache.flink.yarn.YarnConfigKeys.ENV_FLINK_CLASSPATH; +import static org.apache.flink.yarn.YarnConfigKeys.LOCAL_RESOURCE_DESCRIPTOR_SEPARATOR; + +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.flink.api.common.cache.DistributedCache; +import org.apache.flink.api.java.tuple.Tuple2; +import org.apache.flink.client.deployment.ClusterDeploymentException; +import org.apache.flink.client.deployment.ClusterDescriptor; +import org.apache.flink.client.deployment.ClusterRetrieveException; +import org.apache.flink.client.deployment.ClusterSpecification; +import org.apache.flink.client.deployment.application.ApplicationConfiguration; +import org.apache.flink.client.program.ClusterClientProvider; +import org.apache.flink.client.program.PackagedProgramUtils; +import org.apache.flink.client.program.rest.RestClusterClient; +import org.apache.flink.configuration.ConfigConstants; +import org.apache.flink.configuration.ConfigOption; +import org.apache.flink.configuration.ConfigUtils; +import org.apache.flink.configuration.Configuration; +import 
org.apache.flink.configuration.ConfigurationUtils; +import org.apache.flink.configuration.CoreOptions; +import org.apache.flink.configuration.HighAvailabilityOptions; +import org.apache.flink.configuration.IllegalConfigurationException; +import org.apache.flink.configuration.JobManagerOptions; +import org.apache.flink.configuration.PipelineOptions; +import org.apache.flink.configuration.ResourceManagerOptions; +import org.apache.flink.configuration.RestOptions; +import org.apache.flink.configuration.SecurityOptions; +import org.apache.flink.configuration.TaskManagerOptions; +import org.apache.flink.core.plugin.PluginConfig; +import org.apache.flink.core.plugin.PluginUtils; +import org.apache.flink.runtime.clusterframework.BootstrapTools; +import org.apache.flink.runtime.entrypoint.ClusterEntrypoint; +import org.apache.flink.runtime.entrypoint.ClusterEntrypointUtils; +import org.apache.flink.runtime.jobgraph.JobGraph; +import org.apache.flink.runtime.jobmanager.HighAvailabilityMode; +import org.apache.flink.runtime.jobmanager.JobManagerProcessSpec; +import org.apache.flink.runtime.jobmanager.JobManagerProcessUtils; +import org.apache.flink.runtime.security.token.DelegationTokenConverter; +import org.apache.flink.runtime.security.token.DelegationTokenManager; +import org.apache.flink.runtime.security.token.KerberosDelegationTokenManager; +import org.apache.flink.runtime.util.HadoopUtils; +import org.apache.flink.util.CollectionUtil; +import org.apache.flink.util.FlinkException; +import org.apache.flink.util.Preconditions; +import org.apache.flink.util.ShutdownHookUtil; +import org.apache.flink.util.StringUtils; +import org.apache.flink.yarn.configuration.YarnConfigOptions; +import org.apache.flink.yarn.configuration.YarnConfigOptionsInternal; +import org.apache.flink.yarn.configuration.YarnDeploymentTarget; +import org.apache.flink.yarn.configuration.YarnLogConfigUtil; +import org.apache.flink.yarn.entrypoint.YarnApplicationClusterEntryPoint; +import org.apache.flink.yarn.entrypoint.YarnJobClusterEntrypoint; +import org.apache.flink.yarn.entrypoint.YarnSessionClusterEntrypoint; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hdfs.DFSConfigKeys; +import org.apache.hadoop.security.Credentials; +import org.apache.hadoop.security.UserGroupInformation; +import org.apache.hadoop.yarn.api.ApplicationConstants; +import org.apache.hadoop.yarn.api.protocolrecords.GetNewApplicationResponse; +import org.apache.hadoop.yarn.api.records.ApplicationId; +import org.apache.hadoop.yarn.api.records.ApplicationReport; +import org.apache.hadoop.yarn.api.records.ApplicationSubmissionContext; +import org.apache.hadoop.yarn.api.records.ContainerLaunchContext; +import org.apache.hadoop.yarn.api.records.FinalApplicationStatus; +import org.apache.hadoop.yarn.api.records.LocalResourceType; +import org.apache.hadoop.yarn.api.records.NodeReport; +import org.apache.hadoop.yarn.api.records.NodeState; +import org.apache.hadoop.yarn.api.records.Priority; +import org.apache.hadoop.yarn.api.records.QueueInfo; +import org.apache.hadoop.yarn.api.records.Resource; +import org.apache.hadoop.yarn.api.records.YarnApplicationState; +import org.apache.hadoop.yarn.api.records.YarnClusterMetrics; +import org.apache.hadoop.yarn.client.api.YarnClient; +import org.apache.hadoop.yarn.client.api.YarnClientApplication; +import org.apache.hadoop.yarn.conf.YarnConfiguration; +import org.apache.hadoop.yarn.exceptions.YarnException; +import org.apache.hadoop.yarn.util.ConverterUtils; +import 
org.apache.hadoop.yarn.util.Records; + +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.ObjectOutputStream; +import java.io.PrintStream; +import java.io.UnsupportedEncodingException; +import java.lang.reflect.InvocationTargetException; +import java.lang.reflect.Method; +import java.net.URI; +import java.net.URLDecoder; +import java.nio.ByteBuffer; +import java.nio.charset.Charset; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Set; +import java.util.stream.Collectors; + +import javax.annotation.Nullable; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** The descriptor with deployment information for deploying a Flink cluster on Yarn. */ +public class YarnClusterDescriptor implements ClusterDescriptor { + public static final String pathSeparator = ":"; + private static final Logger LOG = LoggerFactory.getLogger(YarnClusterDescriptor.class); + + private final YarnConfiguration yarnConfiguration; + + private final YarnClient yarnClient; + + private final YarnClusterInformationRetriever yarnClusterInformationRetriever; + + /** True if the descriptor must not shut down the YarnClient. */ + private final boolean sharedYarnClient; + + /** Lazily initialized list of files to ship. */ + private final List shipFiles = new LinkedList<>(); + + private final List shipArchives = new LinkedList<>(); + + private final String yarnQueue; + + private Path flinkJarPath; + + private final Configuration flinkConfiguration; + + private final String customName; + + private final String nodeLabel; + + private final String applicationType; + + private YarnConfigOptions.UserJarInclusion userJarInclusion; + + public YarnClusterDescriptor( + Configuration flinkConfiguration, + YarnConfiguration yarnConfiguration, + YarnClient yarnClient, + YarnClusterInformationRetriever yarnClusterInformationRetriever, + boolean sharedYarnClient) { + + this.yarnConfiguration = Preconditions.checkNotNull(yarnConfiguration); + this.yarnClient = Preconditions.checkNotNull(yarnClient); + this.yarnClusterInformationRetriever = Preconditions.checkNotNull(yarnClusterInformationRetriever); + this.sharedYarnClient = sharedYarnClient; + + this.flinkConfiguration = Preconditions.checkNotNull(flinkConfiguration); + this.userJarInclusion = getUserJarInclusionMode(flinkConfiguration); + + getLocalFlinkDistPath(flinkConfiguration).ifPresent(this::setLocalJarPath); + decodeFilesToShipToCluster(flinkConfiguration, YarnConfigOptions.SHIP_FILES) + .ifPresent(this::addShipFiles); + decodeFilesToShipToCluster(flinkConfiguration, YarnConfigOptions.SHIP_ARCHIVES) + .ifPresent(this::addShipArchives); + + this.yarnQueue = flinkConfiguration.getString(YarnConfigOptions.APPLICATION_QUEUE); + this.customName = flinkConfiguration.getString(YarnConfigOptions.APPLICATION_NAME); + this.applicationType = flinkConfiguration.getString(YarnConfigOptions.APPLICATION_TYPE); + this.nodeLabel = flinkConfiguration.getString(YarnConfigOptions.NODE_LABEL); + } + + private Optional> decodeFilesToShipToCluster( + final Configuration configuration, final ConfigOption> configOption) { + checkNotNull(configuration); + checkNotNull(configOption); + + final List files = ConfigUtils.decodeListFromConfig(configuration, configOption, File::new); + return 
files.isEmpty() ? Optional.empty() : Optional.of(files); + } + + private Optional getLocalFlinkDistPath(final Configuration configuration) { + final String localJarPath = configuration.getString(YarnConfigOptions.FLINK_DIST_JAR); + if (localJarPath != null) { + return Optional.of(new Path(localJarPath)); + } + + LOG.info("No path for the flink jar passed. Using the location of " + getClass() + " to locate the jar"); + + // check whether it's actually a jar file --> when testing we execute this class without a + // flink-dist jar + final String decodedPath = getDecodedJarPath(); + return decodedPath.endsWith(".jar") ? Optional.of(new Path(new File(decodedPath).toURI())) : Optional.empty(); + } + + private String getDecodedJarPath() { + final String encodedJarPath = YarnClusterClientFactory.class + .getProtectionDomain() + .getCodeSource() + .getLocation() + .getPath(); + try { + return URLDecoder.decode(encodedJarPath, Charset.defaultCharset().name()); + } catch (UnsupportedEncodingException e) { + throw new RuntimeException("Couldn't decode the encoded Flink dist jar path: " + + encodedJarPath + + " You can supply a path manually via the command line."); + } + } + + @VisibleForTesting + List getShipFiles() { + return shipFiles; + } + + public YarnClient getYarnClient() { + return yarnClient; + } + + /** + * The class to start the application master with. This class runs the main method in case of + * session cluster. + */ + protected String getYarnSessionClusterEntrypoint() { + return YarnSessionClusterEntrypoint.class.getName(); + } + + /** + * The class to start the application master with. This class runs the main method in case of + * the job cluster. + */ + protected String getYarnJobClusterEntrypoint() { + return YarnJobClusterEntrypoint.class.getName(); + } + + public Configuration getFlinkConfiguration() { + return flinkConfiguration; + } + + public void setLocalJarPath(Path localJarPath) { + if (!localJarPath.toString().endsWith("jar")) { + throw new IllegalArgumentException( + "The passed jar path ('" + localJarPath + "') does not end with the 'jar' extension"); + } + this.flinkJarPath = localJarPath; + } + + /** + * Adds the given files to the list of files to ship. + * + *
<p>
Note that any file matching "flink-dist*.jar" will be excluded from the upload by + * {@link YarnApplicationFileUploader#registerMultipleLocalResources(Collection, String, + * LocalResourceType)} since we upload the Flink uber jar ourselves and do not need to deploy it + * multiple times. + * + * @param shipFiles files to ship + */ + public void addShipFiles(List shipFiles) { + checkArgument( + !isUsrLibDirIncludedInShipFiles(shipFiles), + "User-shipped directories configured via : %s should not include %s.", + YarnConfigOptions.SHIP_FILES.key(), + ConfigConstants.DEFAULT_FLINK_USR_LIB_DIR); + this.shipFiles.addAll(shipFiles); + } + + private void addShipArchives(List shipArchives) { + checkArgument(isArchiveOnlyIncludedInShipArchiveFiles(shipArchives), "Non-archive files are included."); + this.shipArchives.addAll(shipArchives); + } + + private static boolean isArchiveOnlyIncludedInShipArchiveFiles(List shipFiles) { + return shipFiles.stream() + .filter(File::isFile) + .map(File::getName) + .map(String::toLowerCase) + .allMatch(name -> name.endsWith(".tar.gz") + || name.endsWith(".tar") + || name.endsWith(".tgz") + || name.endsWith(".dst") + || name.endsWith(".jar") + || name.endsWith(".zip")); + } + + private void isReadyForDeployment(ClusterSpecification clusterSpecification) throws Exception { + + if (this.flinkJarPath == null) { + throw new YarnDeploymentException("The Flink jar path is null"); + } + if (this.flinkConfiguration == null) { + throw new YarnDeploymentException("Flink configuration object has not been set"); + } + + // Check if we don't exceed YARN's maximum virtual cores. + final int numYarnMaxVcores = yarnClusterInformationRetriever.getMaxVcores(); + + int configuredAmVcores = flinkConfiguration.getInteger(YarnConfigOptions.APP_MASTER_VCORES); + if (configuredAmVcores > numYarnMaxVcores) { + throw new IllegalConfigurationException(String.format( + "The number of requested virtual cores for application master %d" + + " exceeds the maximum number of virtual cores %d available in the Yarn Cluster.", + configuredAmVcores, numYarnMaxVcores)); + } + + int configuredVcores = + flinkConfiguration.getInteger(YarnConfigOptions.VCORES, clusterSpecification.getSlotsPerTaskManager()); + // don't configure more than the maximum configured number of vcores + if (configuredVcores > numYarnMaxVcores) { + throw new IllegalConfigurationException(String.format( + "The number of requested virtual cores per node %d" + + " exceeds the maximum number of virtual cores %d available in the Yarn Cluster." + + " Please note that the number of virtual cores is set to the number of task slots by default" + + " unless configured in the Flink config with '%s.'", + configuredVcores, numYarnMaxVcores, YarnConfigOptions.VCORES.key())); + } + + // check if required Hadoop environment variables are set. If not, warn user + if (System.getenv("HADOOP_CONF_DIR") == null && System.getenv("YARN_CONF_DIR") == null) { + LOG.warn("Neither the HADOOP_CONF_DIR nor the YARN_CONF_DIR environment variable is set. 
" + + "The Flink YARN Client needs one of these to be set to properly load the Hadoop " + + "configuration for accessing YARN."); + } + } + + public String getNodeLabel() { + return nodeLabel; + } + + // ------------------------------------------------------------- + // Lifecycle management + // ------------------------------------------------------------- + + @Override + public void close() { + if (!sharedYarnClient) { + yarnClient.stop(); + } + } + + // ------------------------------------------------------------- + // ClusterClient overrides + // ------------------------------------------------------------- + + @Override + public ClusterClientProvider retrieve(ApplicationId applicationId) throws ClusterRetrieveException { + + try { + // check if required Hadoop environment variables are set. If not, warn user + if (System.getenv("HADOOP_CONF_DIR") == null && System.getenv("YARN_CONF_DIR") == null) { + LOG.warn("Neither the HADOOP_CONF_DIR nor the YARN_CONF_DIR environment variable is set." + + "The Flink YARN Client needs one of these to be set to properly load the Hadoop " + + "configuration for accessing YARN."); + } + + final ApplicationReport report = yarnClient.getApplicationReport(applicationId); + + if (report.getFinalApplicationStatus() != FinalApplicationStatus.UNDEFINED) { + // Flink cluster is not running anymore + LOG.error( + "The application {} doesn't run anymore. It has previously completed with final status: {}", + applicationId, + report.getFinalApplicationStatus()); + throw new RuntimeException("The Yarn application " + applicationId + " doesn't run anymore."); + } + + setClusterEntrypointInfoToConfig(report); + + return () -> { + try { + return new RestClusterClient<>(flinkConfiguration, report.getApplicationId()); + } catch (Exception e) { + throw new RuntimeException("Couldn't retrieve Yarn cluster", e); + } + }; + } catch (Exception e) { + throw new ClusterRetrieveException("Couldn't retrieve Yarn cluster", e); + } + } + + @Override + public ClusterClientProvider deploySessionCluster(ClusterSpecification clusterSpecification) + throws ClusterDeploymentException { + try { + return deployInternal( + clusterSpecification, "Flink session cluster", getYarnSessionClusterEntrypoint(), null, false); + } catch (Exception e) { + throw new ClusterDeploymentException("Couldn't deploy Yarn session cluster", e); + } + } + + @Override + public ClusterClientProvider deployApplicationCluster( + final ClusterSpecification clusterSpecification, final ApplicationConfiguration applicationConfiguration) + throws ClusterDeploymentException { + checkNotNull(clusterSpecification); + checkNotNull(applicationConfiguration); + + final YarnDeploymentTarget deploymentTarget = YarnDeploymentTarget.fromConfig(flinkConfiguration); + if (YarnDeploymentTarget.APPLICATION != deploymentTarget) { + throw new ClusterDeploymentException("Couldn't deploy Yarn Application Cluster." + + " Expected deployment.target=" + + YarnDeploymentTarget.APPLICATION.getName() + + " but actual one was \"" + + deploymentTarget.getName() + + "\""); + } + + applicationConfiguration.applyToConfiguration(flinkConfiguration); + + // No need to do pipelineJars validation if it is a PyFlink job. 
+ if (!(PackagedProgramUtils.isPython(applicationConfiguration.getApplicationClassName()) + || PackagedProgramUtils.isPython(applicationConfiguration.getProgramArguments()))) { + final List pipelineJars = + flinkConfiguration.getOptional(PipelineOptions.JARS).orElse(Collections.emptyList()); + Preconditions.checkArgument(pipelineJars.size() == 1, "Should only have one jar"); + } + + try { + return deployInternal( + clusterSpecification, + "Flink Application Cluster", + YarnApplicationClusterEntryPoint.class.getName(), + null, + false); + } catch (Exception e) { + throw new ClusterDeploymentException("Couldn't deploy Yarn Application Cluster", e); + } + } + + @Override + public ClusterClientProvider deployJobCluster( + ClusterSpecification clusterSpecification, JobGraph jobGraph, boolean detached) + throws ClusterDeploymentException { + + LOG.warn( + "Job Clusters are deprecated since Flink 1.15. Please use an Application Cluster/Application Mode instead."); + try { + return deployInternal( + clusterSpecification, "Flink per-job cluster", getYarnJobClusterEntrypoint(), jobGraph, detached); + } catch (Exception e) { + throw new ClusterDeploymentException("Could not deploy Yarn job cluster.", e); + } + } + + @Override + public void killCluster(ApplicationId applicationId) throws FlinkException { + try { + yarnClient.killApplication(applicationId); + + try (final FileSystem fs = FileSystem.get(yarnConfiguration)) { + final Path applicationDir = + YarnApplicationFileUploader.getApplicationDirPath(getStagingDir(fs), applicationId); + + Utils.deleteApplicationFiles(applicationDir.toUri().toString()); + } + + } catch (YarnException | IOException e) { + throw new FlinkException("Could not kill the Yarn Flink cluster with id " + applicationId + '.', e); + } + } + + /** + * This method will block until the ApplicationMaster/JobManager have been deployed on YARN. + * + * @param clusterSpecification Initial cluster specification for the Flink cluster to be + * deployed + * @param applicationName name of the Yarn application to start + * @param yarnClusterEntrypoint Class name of the Yarn cluster entry point. 
+ * @param jobGraph A job graph which is deployed with the Flink cluster, {@code null} if none + * @param detached True if the cluster should be started in detached mode + */ + private ClusterClientProvider deployInternal( + ClusterSpecification clusterSpecification, + String applicationName, + String yarnClusterEntrypoint, + @Nullable JobGraph jobGraph, + boolean detached) + throws Exception { + + final UserGroupInformation currentUser = UserGroupInformation.getCurrentUser(); + if (HadoopUtils.isKerberosSecurityEnabled(currentUser)) { + boolean useTicketCache = flinkConfiguration.getBoolean(SecurityOptions.KERBEROS_LOGIN_USETICKETCACHE); + + if (!HadoopUtils.areKerberosCredentialsValid(currentUser, useTicketCache)) { + throw new RuntimeException("Hadoop security with Kerberos is enabled but the login user " + + "does not have Kerberos credentials or delegation tokens!"); + } + + final boolean fetchToken = flinkConfiguration.getBoolean(SecurityOptions.KERBEROS_FETCH_DELEGATION_TOKEN); + final boolean yarnAccessFSEnabled = !CollectionUtil.isNullOrEmpty( + flinkConfiguration.get(SecurityOptions.KERBEROS_HADOOP_FILESYSTEMS_TO_ACCESS)); + if (!fetchToken && yarnAccessFSEnabled) { + throw new IllegalConfigurationException(String.format( + "When %s is disabled, %s must be disabled as well.", + SecurityOptions.KERBEROS_FETCH_DELEGATION_TOKEN.key(), + SecurityOptions.KERBEROS_HADOOP_FILESYSTEMS_TO_ACCESS.key())); + } + } + + isReadyForDeployment(clusterSpecification); + + // ------------------ Check if the specified queue exists -------------------- + + checkYarnQueues(yarnClient); + + // ------------------ Check if the YARN ClusterClient has the requested resources + // -------------- + + // Create application via yarnClient + final YarnClientApplication yarnApplication = yarnClient.createApplication(); + final GetNewApplicationResponse appResponse = yarnApplication.getNewApplicationResponse(); + + Resource maxRes = appResponse.getMaximumResourceCapability(); + + final ClusterResourceDescription freeClusterMem; + try { + freeClusterMem = getCurrentFreeClusterResources(yarnClient); + } catch (YarnException | IOException e) { + failSessionDuringDeployment(yarnClient, yarnApplication); + throw new YarnDeploymentException("Could not retrieve information about free cluster resources.", e); + } + + final int yarnMinAllocationMB = yarnConfiguration.getInt( + YarnConfiguration.RM_SCHEDULER_MINIMUM_ALLOCATION_MB, + YarnConfiguration.DEFAULT_RM_SCHEDULER_MINIMUM_ALLOCATION_MB); + if (yarnMinAllocationMB <= 0) { + throw new YarnDeploymentException("The minimum allocation memory " + + "(" + + yarnMinAllocationMB + + " MB) configured via '" + + YarnConfiguration.RM_SCHEDULER_MINIMUM_ALLOCATION_MB + + "' should be greater than 0."); + } + + final ClusterSpecification validClusterSpecification; + try { + validClusterSpecification = + validateClusterResources(clusterSpecification, yarnMinAllocationMB, maxRes, freeClusterMem); + } catch (YarnDeploymentException yde) { + failSessionDuringDeployment(yarnClient, yarnApplication); + throw yde; + } + + LOG.info("Cluster specification: {}", validClusterSpecification); + + final ClusterEntrypoint.ExecutionMode executionMode = + detached ? 
ClusterEntrypoint.ExecutionMode.DETACHED : ClusterEntrypoint.ExecutionMode.NORMAL; + + flinkConfiguration.setString(ClusterEntrypoint.INTERNAL_CLUSTER_EXECUTION_MODE, executionMode.toString()); + + ApplicationReport report = startAppMaster( + flinkConfiguration, + applicationName, + yarnClusterEntrypoint, + jobGraph, + yarnClient, + yarnApplication, + validClusterSpecification); + + // print the application id for user to cancel themselves. + if (detached) { + final ApplicationId yarnApplicationId = report.getApplicationId(); + logDetachedClusterInformation(yarnApplicationId, LOG); + } + + setClusterEntrypointInfoToConfig(report); + + return () -> { + try { + return new RestClusterClient<>(flinkConfiguration, report.getApplicationId()); + } catch (Exception e) { + throw new RuntimeException("Error while creating RestClusterClient.", e); + } + }; + } + + private ClusterSpecification validateClusterResources( + ClusterSpecification clusterSpecification, + int yarnMinAllocationMB, + Resource maximumResourceCapability, + ClusterResourceDescription freeClusterResources) + throws YarnDeploymentException { + + int jobManagerMemoryMb = clusterSpecification.getMasterMemoryMB(); + final int taskManagerMemoryMb = clusterSpecification.getTaskManagerMemoryMB(); + + logIfComponentMemNotIntegerMultipleOfYarnMinAllocation("JobManager", jobManagerMemoryMb, yarnMinAllocationMB); + logIfComponentMemNotIntegerMultipleOfYarnMinAllocation("TaskManager", taskManagerMemoryMb, yarnMinAllocationMB); + + // set the memory to minAllocationMB to do the next checks correctly + if (jobManagerMemoryMb < yarnMinAllocationMB) { + jobManagerMemoryMb = yarnMinAllocationMB; + } + + final String note = + "Please check the 'yarn.scheduler.maximum-allocation-mb' and the 'yarn.nodemanager.resource.memory-mb' configuration values\n"; + if (jobManagerMemoryMb > maximumResourceCapability.getMemory()) { + throw new YarnDeploymentException( + "The cluster does not have the requested resources for the JobManager available!\n" + + "Maximum Memory: " + + maximumResourceCapability.getMemory() + + "MB Requested: " + + jobManagerMemoryMb + + "MB. " + + note); + } + + if (taskManagerMemoryMb > maximumResourceCapability.getMemory()) { + throw new YarnDeploymentException( + "The cluster does not have the requested resources for the TaskManagers available!\n" + + "Maximum Memory: " + + maximumResourceCapability.getMemory() + + " Requested: " + + taskManagerMemoryMb + + "MB. " + + note); + } + + final String noteRsc = + "\nThe Flink YARN client will try to allocate the YARN session, but maybe not all TaskManagers are " + + "connecting from the beginning because the resources are currently not available in the cluster. 
" + + "The allocation might take more time than usual because the Flink YARN client needs to wait until " + + "the resources become available."; + + if (taskManagerMemoryMb > freeClusterResources.containerLimit) { + LOG.warn("The requested amount of memory for the TaskManagers (" + + taskManagerMemoryMb + + "MB) is more than " + + "the largest possible YARN container: " + + freeClusterResources.containerLimit + + noteRsc); + } + if (jobManagerMemoryMb > freeClusterResources.containerLimit) { + LOG.warn("The requested amount of memory for the JobManager (" + + jobManagerMemoryMb + + "MB) is more than " + + "the largest possible YARN container: " + + freeClusterResources.containerLimit + + noteRsc); + } + + return new ClusterSpecification.ClusterSpecificationBuilder() + .setMasterMemoryMB(jobManagerMemoryMb) + .setTaskManagerMemoryMB(taskManagerMemoryMb) + .setSlotsPerTaskManager(clusterSpecification.getSlotsPerTaskManager()) + .createClusterSpecification(); + } + + private void logIfComponentMemNotIntegerMultipleOfYarnMinAllocation( + String componentName, int componentMemoryMB, int yarnMinAllocationMB) { + int normalizedMemMB = + (componentMemoryMB + (yarnMinAllocationMB - 1)) / yarnMinAllocationMB * yarnMinAllocationMB; + if (normalizedMemMB <= 0) { + normalizedMemMB = yarnMinAllocationMB; + } + if (componentMemoryMB != normalizedMemMB) { + LOG.info( + "The configured {} memory is {} MB. YARN will allocate {} MB to make up an integer multiple of its " + + "minimum allocation memory ({} MB, configured via 'yarn.scheduler.minimum-allocation-mb'). The extra {} MB " + + "may not be used by Flink.", + componentName, + componentMemoryMB, + normalizedMemMB, + yarnMinAllocationMB, + normalizedMemMB - componentMemoryMB); + } + } + + private void checkYarnQueues(YarnClient yarnClient) { + try { + List queues = yarnClient.getAllQueues(); + if (queues.size() > 0 + && this.yarnQueue != null) { // check only if there are queues configured in yarn and for + // this session. + boolean queueFound = false; + for (QueueInfo queue : queues) { + if (queue.getQueueName().equals(this.yarnQueue) + || queue.getQueueName().equals("root." + this.yarnQueue)) { + queueFound = true; + break; + } + } + if (!queueFound) { + String queueNames = StringUtils.toQuotedListString(queues.toArray()); + LOG.warn("The specified queue '" + + this.yarnQueue + + "' does not exist. " + + "Available queues: " + + queueNames); + } + } else { + LOG.debug("The YARN cluster does not have any queues configured"); + } + } catch (Throwable e) { + LOG.warn("Error while getting queue information from YARN: " + e.getMessage()); + if (LOG.isDebugEnabled()) { + LOG.debug("Error details", e); + } + } + } + + private ApplicationReport startAppMaster( + Configuration configuration, + String applicationName, + String yarnClusterEntrypoint, + JobGraph jobGraph, + YarnClient yarnClient, + YarnClientApplication yarnApplication, + ClusterSpecification clusterSpecification) + throws Exception { + + // ------------------ Initialize the file systems ------------------------- + + org.apache.flink.core.fs.FileSystem.initialize( + configuration, PluginUtils.createPluginManagerFromRootFolder(configuration)); + + final FileSystem fs = FileSystem.get(yarnConfiguration); + + // hard coded check for the GoogleHDFS client because its not overriding the getScheme() + // method. + if (!fs.getClass().getSimpleName().equals("GoogleHadoopFileSystem") + && fs.getScheme().startsWith("file")) { + LOG.warn("The file system scheme is '" + + fs.getScheme() + + "'. 
This indicates that the " + + "specified Hadoop configuration path is wrong and the system is using the default Hadoop configuration values." + + "The Flink YARN client needs to store its files in a distributed file system"); + } + + ApplicationSubmissionContext appContext = yarnApplication.getApplicationSubmissionContext(); + + final List providedLibDirs = Utils.getQualifiedRemoteProvidedLibDirs(configuration, yarnConfiguration); + + final Optional providedUsrLibDir = + Utils.getQualifiedRemoteProvidedUsrLib(configuration, yarnConfiguration); + + Path stagingDirPath = getStagingDir(fs); + FileSystem stagingDirFs = stagingDirPath.getFileSystem(yarnConfiguration); + final YarnApplicationFileUploader fileUploader = YarnApplicationFileUploader.from( + stagingDirFs, stagingDirPath, providedLibDirs, appContext.getApplicationId(), getFileReplication()); + + // The files need to be shipped and added to classpath. + Set systemShipFiles = new HashSet<>(shipFiles.size()); + for (File file : shipFiles) { + systemShipFiles.add(file.getAbsoluteFile()); + } + + final String logConfigFilePath = configuration.getString(YarnConfigOptionsInternal.APPLICATION_LOG_CONFIG_FILE); + if (logConfigFilePath != null) { + systemShipFiles.add(new File(logConfigFilePath)); + } + + // Set-up ApplicationSubmissionContext for the application + + final ApplicationId appId = appContext.getApplicationId(); + + // ------------------ Add Zookeeper namespace to local flinkConfiguraton ------ + setHAClusterIdIfNotSet(configuration, appId); + + if (HighAvailabilityMode.isHighAvailabilityModeActivated(configuration)) { + // activate re-execution of failed applications + appContext.setMaxAppAttempts(configuration.getInteger( + YarnConfigOptions.APPLICATION_ATTEMPTS.key(), YarnConfiguration.DEFAULT_RM_AM_MAX_ATTEMPTS)); + + activateHighAvailabilitySupport(appContext); + } else { + // set number of application retries to 1 in the default case + appContext.setMaxAppAttempts(configuration.getInteger(YarnConfigOptions.APPLICATION_ATTEMPTS.key(), 1)); + } + + final Set userJarFiles = new HashSet<>(); + if (jobGraph != null) { + userJarFiles.addAll(jobGraph.getUserJars().stream() + .map(f -> f.toUri()) + .map(Path::new) + .collect(Collectors.toSet())); + } + + final List jarUrls = ConfigUtils.decodeListFromConfig(configuration, PipelineOptions.JARS, URI::create); + if (jarUrls != null && YarnApplicationClusterEntryPoint.class.getName().equals(yarnClusterEntrypoint)) { + userJarFiles.addAll(jarUrls.stream().map(Path::new).collect(Collectors.toSet())); + } + + // only for per job mode + if (jobGraph != null) { + for (Map.Entry entry : + jobGraph.getUserArtifacts().entrySet()) { + // only upload local files + if (!Utils.isRemotePath(entry.getValue().filePath)) { + Path localPath = new Path(entry.getValue().filePath); + Tuple2 remoteFileInfo = fileUploader.uploadLocalFileToRemote(localPath, entry.getKey()); + jobGraph.setUserArtifactRemotePath(entry.getKey(), remoteFileInfo.f0.toString()); + } + } + + jobGraph.writeUserArtifactEntriesToConfiguration(); + } + + if (providedLibDirs == null || providedLibDirs.isEmpty()) { + addLibFoldersToShipFiles(systemShipFiles); + } + + // Register all files in provided lib dirs as local resources with public visibility + // and upload the remaining dependencies as local resources with APPLICATION visibility. 
+ final List systemClassPaths = fileUploader.registerProvidedLocalResources(); + final List uploadedDependencies = fileUploader.registerMultipleLocalResources( + systemShipFiles.stream().map(e -> new Path(e.toURI())).collect(Collectors.toSet()), + Path.CUR_DIR, + LocalResourceType.FILE); + systemClassPaths.addAll(uploadedDependencies); + + // upload and register ship-only files + // Plugin files only need to be shipped and should not be added to classpath. + if (providedLibDirs == null || providedLibDirs.isEmpty()) { + Set shipOnlyFiles = new HashSet<>(); + addPluginsFoldersToShipFiles(shipOnlyFiles); + fileUploader.registerMultipleLocalResources( + shipOnlyFiles.stream().map(e -> new Path(e.toURI())).collect(Collectors.toSet()), + Path.CUR_DIR, + LocalResourceType.FILE); + } + + if (!shipArchives.isEmpty()) { + fileUploader.registerMultipleLocalResources( + shipArchives.stream().map(e -> new Path(e.toURI())).collect(Collectors.toSet()), + Path.CUR_DIR, + LocalResourceType.ARCHIVE); + } + + // only for application mode + // Python jar file only needs to be shipped and should not be added to classpath. + if (YarnApplicationClusterEntryPoint.class.getName().equals(yarnClusterEntrypoint) + && PackagedProgramUtils.isPython(configuration.get(APPLICATION_MAIN_CLASS))) { + fileUploader.registerMultipleLocalResources( + Collections.singletonList( + new Path(PackagedProgramUtils.getPythonJar().toURI())), + ConfigConstants.DEFAULT_FLINK_OPT_DIR, + LocalResourceType.FILE); + } + + // Upload and register user jars + final List userClassPaths = fileUploader.registerMultipleLocalResources( + userJarFiles, + userJarInclusion == YarnConfigOptions.UserJarInclusion.DISABLED + ? ConfigConstants.DEFAULT_FLINK_USR_LIB_DIR + : Path.CUR_DIR, + LocalResourceType.FILE); + + // usrlib in remote will be used first. + if (providedUsrLibDir.isPresent()) { + final List usrLibClassPaths = fileUploader.registerMultipleLocalResources( + Collections.singletonList(providedUsrLibDir.get()), Path.CUR_DIR, LocalResourceType.FILE); + userClassPaths.addAll(usrLibClassPaths); + } else if (ClusterEntrypointUtils.tryFindUserLibDirectory().isPresent()) { + // local usrlib will be automatically shipped if it exists and there is no remote + // usrlib. 
+ final Set usrLibShipFiles = new HashSet<>(); + addUsrLibFolderToShipFiles(usrLibShipFiles); + final List usrLibClassPaths = fileUploader.registerMultipleLocalResources( + usrLibShipFiles.stream().map(e -> new Path(e.toURI())).collect(Collectors.toSet()), + Path.CUR_DIR, + LocalResourceType.FILE); + userClassPaths.addAll(usrLibClassPaths); + } + + if (userJarInclusion == YarnConfigOptions.UserJarInclusion.ORDER) { + systemClassPaths.addAll(userClassPaths); + } + + // normalize classpath by sorting + Collections.sort(systemClassPaths); + Collections.sort(userClassPaths); + + // classpath assembler + StringBuilder classPathBuilder = new StringBuilder(); + if (userJarInclusion == YarnConfigOptions.UserJarInclusion.FIRST) { + for (String userClassPath : userClassPaths) { + classPathBuilder.append(userClassPath).append(pathSeparator); + } + } + for (String classPath : systemClassPaths) { + classPathBuilder.append(classPath).append(pathSeparator); + } + + // Setup jar for ApplicationMaster + final YarnLocalResourceDescriptor localResourceDescFlinkJar = fileUploader.uploadFlinkDist(flinkJarPath); + classPathBuilder.append(localResourceDescFlinkJar.getResourceKey()).append(pathSeparator); + + // write job graph to tmp file and add it to local resource + // TODO: server use user main method to generate job graph + if (jobGraph != null) { + File tmpJobGraphFile = null; + try { + tmpJobGraphFile = File.createTempFile(appId.toString(), null); + try (FileOutputStream output = new FileOutputStream(tmpJobGraphFile); + ObjectOutputStream obOutput = new ObjectOutputStream(output)) { + obOutput.writeObject(jobGraph); + } + + final String jobGraphFilename = "job.graph"; + configuration.setString(JOB_GRAPH_FILE_PATH, jobGraphFilename); + + fileUploader.registerSingleLocalResource( + jobGraphFilename, new Path(tmpJobGraphFile.toURI()), "", LocalResourceType.FILE, true, false); + classPathBuilder.append(jobGraphFilename).append(pathSeparator); + } catch (Exception e) { + LOG.warn("Add job graph to local resource fail."); + throw e; + } finally { + if (tmpJobGraphFile != null && !tmpJobGraphFile.delete()) { + LOG.warn("Fail to delete temporary file {}.", tmpJobGraphFile.toPath()); + } + } + } + + // Upload the flink configuration + // write out configuration file + File tmpConfigurationFile = null; + try { + tmpConfigurationFile = File.createTempFile(appId + "-flink-conf.yaml", null); + + // remove localhost bind hosts as they render production clusters unusable + removeLocalhostBindHostSetting(configuration, JobManagerOptions.BIND_HOST); + removeLocalhostBindHostSetting(configuration, TaskManagerOptions.BIND_HOST); + // this setting is unconditionally overridden anyway, so we remove it for clarity + configuration.removeConfig(TaskManagerOptions.HOST); + + BootstrapTools.writeConfiguration(configuration, tmpConfigurationFile); + + String flinkConfigKey = "flink-conf.yaml"; + fileUploader.registerSingleLocalResource( + flinkConfigKey, + new Path(tmpConfigurationFile.getAbsolutePath()), + "", + LocalResourceType.FILE, + true, + true); + classPathBuilder.append("flink-conf.yaml").append(pathSeparator); + } finally { + if (tmpConfigurationFile != null && !tmpConfigurationFile.delete()) { + LOG.warn("Fail to delete temporary file {}.", tmpConfigurationFile.toPath()); + } + } + + if (userJarInclusion == YarnConfigOptions.UserJarInclusion.LAST) { + for (String userClassPath : userClassPaths) { + classPathBuilder.append(userClassPath).append(pathSeparator); + } + } + + // To support Yarn Secure Integration Test 
Scenario + // In Integration test setup, the Yarn containers created by YarnMiniCluster does not have + // the Yarn site XML + // and KRB5 configuration files. We are adding these files as container local resources for + // the container + // applications (JM/TMs) to have proper secure cluster setup + Path remoteYarnSiteXmlPath = null; + if (System.getenv("IN_TESTS") != null) { + File f = new File(System.getenv("YARN_CONF_DIR"), Utils.YARN_SITE_FILE_NAME); + LOG.info("Adding Yarn configuration {} to the AM container local resource bucket", f.getAbsolutePath()); + Path yarnSitePath = new Path(f.getAbsolutePath()); + remoteYarnSiteXmlPath = fileUploader + .registerSingleLocalResource( + Utils.YARN_SITE_FILE_NAME, yarnSitePath, "", LocalResourceType.FILE, false, false) + .getPath(); + if (System.getProperty("java.security.krb5.conf") != null) { + configuration.set(SecurityOptions.KERBEROS_KRB5_PATH, System.getProperty("java.security.krb5.conf")); + } + } + + Path remoteKrb5Path = null; + boolean hasKrb5 = false; + String krb5Config = configuration.get(SecurityOptions.KERBEROS_KRB5_PATH); + if (!StringUtils.isNullOrWhitespaceOnly(krb5Config)) { + final File krb5 = new File(krb5Config); + LOG.info("Adding KRB5 configuration {} to the AM container local resource bucket", krb5.getAbsolutePath()); + final Path krb5ConfPath = new Path(krb5.getAbsolutePath()); + remoteKrb5Path = fileUploader + .registerSingleLocalResource( + Utils.KRB5_FILE_NAME, krb5ConfPath, "", LocalResourceType.FILE, false, false) + .getPath(); + hasKrb5 = true; + } + + Path remotePathKeytab = null; + String localizedKeytabPath = null; + String keytab = configuration.getString(SecurityOptions.KERBEROS_LOGIN_KEYTAB); + if (keytab != null) { + boolean localizeKeytab = flinkConfiguration.getBoolean(YarnConfigOptions.SHIP_LOCAL_KEYTAB); + localizedKeytabPath = flinkConfiguration.getString(YarnConfigOptions.LOCALIZED_KEYTAB_PATH); + if (localizeKeytab) { + // Localize the keytab to YARN containers via local resource. + LOG.info("Adding keytab {} to the AM container local resource bucket", keytab); + remotePathKeytab = fileUploader + .registerSingleLocalResource( + localizedKeytabPath, new Path(keytab), "", LocalResourceType.FILE, false, false) + .getPath(); + } else { + // // Assume Keytab is pre-installed in the container. 
+ localizedKeytabPath = flinkConfiguration.getString(YarnConfigOptions.LOCALIZED_KEYTAB_PATH); + } + } + + final JobManagerProcessSpec processSpec = + JobManagerProcessUtils.processSpecFromConfigWithNewOptionToInterpretLegacyHeap( + flinkConfiguration, JobManagerOptions.TOTAL_PROCESS_MEMORY); + final ContainerLaunchContext amContainer = + setupApplicationMasterContainer(yarnClusterEntrypoint, hasKrb5, processSpec); + + // New delegation token framework + if (configuration.getBoolean(SecurityOptions.KERBEROS_FETCH_DELEGATION_TOKEN)) { + setTokensFor(amContainer); + } + // Old delegation token framework + if (UserGroupInformation.isSecurityEnabled()) { + LOG.info("Adding delegation token to the AM container."); + final List pathsToObtainToken = new ArrayList<>(); + boolean fetchToken = configuration.getBoolean(SecurityOptions.KERBEROS_FETCH_DELEGATION_TOKEN); + if (fetchToken) { + List yarnAccessList = ConfigUtils.decodeListFromConfig( + configuration, SecurityOptions.KERBEROS_HADOOP_FILESYSTEMS_TO_ACCESS, Path::new); + pathsToObtainToken.addAll(yarnAccessList); + pathsToObtainToken.addAll(fileUploader.getRemotePaths()); + } + Utils.setTokensFor(amContainer, pathsToObtainToken, yarnConfiguration, fetchToken); + } + + amContainer.setLocalResources(fileUploader.getRegisteredLocalResources()); + fileUploader.close(); + + // Setup CLASSPATH and environment variables for ApplicationMaster + final Map appMasterEnv = generateApplicationMasterEnv( + fileUploader, classPathBuilder.toString(), localResourceDescFlinkJar.toString(), appId.toString()); + + if (localizedKeytabPath != null) { + appMasterEnv.put(YarnConfigKeys.LOCAL_KEYTAB_PATH, localizedKeytabPath); + String principal = configuration.getString(SecurityOptions.KERBEROS_LOGIN_PRINCIPAL); + appMasterEnv.put(YarnConfigKeys.KEYTAB_PRINCIPAL, principal); + if (remotePathKeytab != null) { + appMasterEnv.put(YarnConfigKeys.REMOTE_KEYTAB_PATH, remotePathKeytab.toString()); + } + } + + // To support Yarn Secure Integration Test Scenario + if (remoteYarnSiteXmlPath != null) { + appMasterEnv.put(YarnConfigKeys.ENV_YARN_SITE_XML_PATH, remoteYarnSiteXmlPath.toString()); + } + if (remoteKrb5Path != null) { + appMasterEnv.put(YarnConfigKeys.ENV_KRB5_PATH, remoteKrb5Path.toString()); + } + + amContainer.setEnvironment(appMasterEnv); + + // Set up resource type requirements for ApplicationMaster + Resource capability = Records.newRecord(Resource.class); + capability.setMemory(clusterSpecification.getMasterMemoryMB()); + capability.setVirtualCores(flinkConfiguration.getInteger(YarnConfigOptions.APP_MASTER_VCORES)); + + final String customApplicationName = customName != null ? customName : applicationName; + + appContext.setApplicationName(customApplicationName); + appContext.setApplicationType(applicationType != null ? 
applicationType : "Apache Flink"); + appContext.setAMContainerSpec(amContainer); + appContext.setResource(capability); + + // Set priority for application + int priorityNum = flinkConfiguration.getInteger(YarnConfigOptions.APPLICATION_PRIORITY); + if (priorityNum >= 0) { + Priority priority = Priority.newInstance(priorityNum); + appContext.setPriority(priority); + } + + if (yarnQueue != null) { + appContext.setQueue(yarnQueue); + } + + setApplicationNodeLabel(appContext); + + setApplicationTags(appContext); + + // add a hook to clean up in case deployment fails + Thread deploymentFailureHook = new DeploymentFailureHook(yarnApplication, fileUploader.getApplicationDir()); + Runtime.getRuntime().addShutdownHook(deploymentFailureHook); + LOG.info("Submitting application master " + appId); + yarnClient.submitApplication(appContext); + + LOG.info("Waiting for the cluster to be allocated"); + final long startTime = System.currentTimeMillis(); + ApplicationReport report; + YarnApplicationState lastAppState = YarnApplicationState.NEW; + loop: + while (true) { + try { + report = yarnClient.getApplicationReport(appId); + } catch (IOException e) { + throw new YarnDeploymentException("Failed to deploy the cluster.", e); + } + YarnApplicationState appState = report.getYarnApplicationState(); + LOG.debug("Application State: {}", appState); + switch (appState) { + case FAILED: + case KILLED: + throw new YarnDeploymentException("The YARN application unexpectedly switched to state " + + appState + + " during deployment. \n" + + "Diagnostics from YARN: " + + report.getDiagnostics() + + "\n" + + "If log aggregation is enabled on your cluster, use this command to further investigate the issue:\n" + + "yarn logs -applicationId " + + appId); + // break .. + case RUNNING: + LOG.info("YARN application has been deployed successfully."); + break loop; + case FINISHED: + LOG.info("YARN application has been finished successfully."); + break loop; + default: + if (appState != lastAppState) { + LOG.info("Deploying cluster, current state " + appState); + } + if (System.currentTimeMillis() - startTime > 60000) { + LOG.info( + "Deployment took more than 60 seconds. 
Please check if the requested resources are available in the YARN cluster"); + } + } + lastAppState = appState; + Thread.sleep(250); + } + + // since deployment was successful, remove the hook + ShutdownHookUtil.removeShutdownHook(deploymentFailureHook, getClass().getSimpleName(), LOG); + return report; + } + + private void removeLocalhostBindHostSetting(Configuration configuration, ConfigOption option) { + configuration + .getOptional(option) + .filter(bindHost -> bindHost.equals("localhost")) + .ifPresent(bindHost -> { + LOG.info( + "Removing 'localhost' {} setting from effective configuration; using '0.0.0.0' instead.", + option); + configuration.removeConfig(option); + }); + } + + private void setTokensFor(ContainerLaunchContext containerLaunchContext) throws Exception { + LOG.info("Adding delegation tokens to the AM container."); + + Credentials credentials = new Credentials(); + + DelegationTokenManager delegationTokenManager = + new KerberosDelegationTokenManager(flinkConfiguration, null, null); + delegationTokenManager.obtainDelegationTokens(credentials); + + ByteBuffer tokens = ByteBuffer.wrap(DelegationTokenConverter.serialize(credentials)); + containerLaunchContext.setTokens(tokens); + + LOG.info("Delegation tokens added to the AM container."); + } + + /** + * Returns the configured remote target home directory if set, otherwise returns the default + * home directory. + * + * @param defaultFileSystem default file system used + * @return the remote target home directory + */ + @VisibleForTesting + Path getStagingDir(FileSystem defaultFileSystem) throws IOException { + final String configuredStagingDir = flinkConfiguration.getString(YarnConfigOptions.STAGING_DIRECTORY); + if (configuredStagingDir == null) { + return defaultFileSystem.getHomeDirectory(); + } + FileSystem stagingDirFs = new Path(configuredStagingDir).getFileSystem(defaultFileSystem.getConf()); + return stagingDirFs.makeQualified(new Path(configuredStagingDir)); + } + + private int getFileReplication() { + final int yarnFileReplication = + yarnConfiguration.getInt(DFSConfigKeys.DFS_REPLICATION_KEY, DFSConfigKeys.DFS_REPLICATION_DEFAULT); + final int fileReplication = flinkConfiguration.getInteger(YarnConfigOptions.FILE_REPLICATION); + return fileReplication > 0 ? fileReplication : yarnFileReplication; + } + + private static String encodeYarnLocalResourceDescriptorListToString(List resources) { + return String.join( + LOCAL_RESOURCE_DESCRIPTOR_SEPARATOR, + resources.stream().map(YarnLocalResourceDescriptor::toString).collect(Collectors.toList())); + } + + /** + * Kills YARN application and stops YARN client. + * + *
<p>
Use this method to kill the App before it has been properly deployed + */ + private void failSessionDuringDeployment(YarnClient yarnClient, YarnClientApplication yarnApplication) { + LOG.info("Killing YARN application"); + + try { + yarnClient.killApplication( + yarnApplication.getNewApplicationResponse().getApplicationId()); + } catch (Exception e) { + // we only log a debug message here because the "killApplication" call is a best-effort + // call (we don't know if the application has been deployed when the error occurred). + LOG.debug("Error while killing YARN application", e); + } + } + + private static class ClusterResourceDescription { + public final int totalFreeMemory; + public final int containerLimit; + public final int[] nodeManagersFree; + + public ClusterResourceDescription(int totalFreeMemory, int containerLimit, int[] nodeManagersFree) { + this.totalFreeMemory = totalFreeMemory; + this.containerLimit = containerLimit; + this.nodeManagersFree = nodeManagersFree; + } + } + + private ClusterResourceDescription getCurrentFreeClusterResources(YarnClient yarnClient) + throws YarnException, IOException { + List nodes = yarnClient.getNodeReports(NodeState.RUNNING); + + int totalFreeMemory = 0; + int containerLimit = 0; + int[] nodeManagersFree = new int[nodes.size()]; + + for (int i = 0; i < nodes.size(); i++) { + NodeReport rep = nodes.get(i); + int free = rep.getCapability().getMemory() + - (rep.getUsed() != null ? rep.getUsed().getMemory() : 0); + nodeManagersFree[i] = free; + totalFreeMemory += free; + if (free > containerLimit) { + containerLimit = free; + } + } + return new ClusterResourceDescription(totalFreeMemory, containerLimit, nodeManagersFree); + } + + @Override + public String getClusterDescription() { + + try { + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + PrintStream ps = new PrintStream(baos); + + YarnClusterMetrics metrics = yarnClient.getYarnClusterMetrics(); + + ps.append("NodeManagers in the ClusterClient " + metrics.getNumNodeManagers()); + List nodes = yarnClient.getNodeReports(NodeState.RUNNING); + final String format = "|%-16s |%-16s %n"; + ps.printf("|Property |Value %n"); + ps.println("+---------------------------------------+"); + int totalMemory = 0; + int totalCores = 0; + for (NodeReport rep : nodes) { + final Resource res = rep.getCapability(); + totalMemory += res.getMemory(); + totalCores += res.getVirtualCores(); + ps.format(format, "NodeID", rep.getNodeId()); + ps.format(format, "Memory", res.getMemory() + " MB"); + ps.format(format, "vCores", res.getVirtualCores()); + ps.format(format, "HealthReport", rep.getHealthReport()); + ps.format(format, "Containers", rep.getNumContainers()); + ps.println("+---------------------------------------+"); + } + ps.println("Summary: totalMemory " + totalMemory + " totalCores " + totalCores); + List qInfo = yarnClient.getAllQueues(); + for (QueueInfo q : qInfo) { + ps.println("Queue: " + + q.getQueueName() + + ", Current Capacity: " + + q.getCurrentCapacity() + + " Max Capacity: " + + q.getMaximumCapacity() + + " Applications: " + + q.getApplications().size()); + } + return baos.toString(); + } catch (Exception e) { + throw new RuntimeException("Couldn't get cluster description", e); + } + } + + private void activateHighAvailabilitySupport(ApplicationSubmissionContext appContext) + throws InvocationTargetException, IllegalAccessException { + + ApplicationSubmissionContextReflector reflector = ApplicationSubmissionContextReflector.getInstance(); + + 
reflector.setKeepContainersAcrossApplicationAttempts(appContext, true); + + reflector.setAttemptFailuresValidityInterval( + appContext, + flinkConfiguration.getLong(YarnConfigOptions.APPLICATION_ATTEMPT_FAILURE_VALIDITY_INTERVAL)); + } + + private void setApplicationTags(final ApplicationSubmissionContext appContext) + throws InvocationTargetException, IllegalAccessException { + + final ApplicationSubmissionContextReflector reflector = ApplicationSubmissionContextReflector.getInstance(); + final String tagsString = flinkConfiguration.getString(YarnConfigOptions.APPLICATION_TAGS); + + final Set applicationTags = new HashSet<>(); + + // Trim whitespace and cull empty tags + for (final String tag : tagsString.split(",")) { + final String trimmedTag = tag.trim(); + if (!trimmedTag.isEmpty()) { + applicationTags.add(trimmedTag); + } + } + + reflector.setApplicationTags(appContext, applicationTags); + } + + private void setApplicationNodeLabel(final ApplicationSubmissionContext appContext) + throws InvocationTargetException, IllegalAccessException { + + if (nodeLabel != null) { + final ApplicationSubmissionContextReflector reflector = ApplicationSubmissionContextReflector.getInstance(); + reflector.setApplicationNodeLabel(appContext, nodeLabel); + } + } + + /** + * Singleton object which uses reflection to determine whether the {@link + * ApplicationSubmissionContext} supports various methods which, depending on the Hadoop + * version, may or may not be supported. + * + *

<p>If an unsupported method is invoked, nothing happens. + * + *
<p>
Currently three methods are proxied: - setApplicationTags (>= 2.4.0) - + * setAttemptFailuresValidityInterval (>= 2.6.0) - setKeepContainersAcrossApplicationAttempts + * (>= 2.4.0) - setNodeLabelExpression (>= 2.6.0) + */ + private static class ApplicationSubmissionContextReflector { + private static final Logger LOG = LoggerFactory.getLogger(ApplicationSubmissionContextReflector.class); + + private static final ApplicationSubmissionContextReflector instance = + new ApplicationSubmissionContextReflector(ApplicationSubmissionContext.class); + + public static ApplicationSubmissionContextReflector getInstance() { + return instance; + } + + private static final String APPLICATION_TAGS_METHOD_NAME = "setApplicationTags"; + private static final String ATTEMPT_FAILURES_METHOD_NAME = "setAttemptFailuresValidityInterval"; + private static final String KEEP_CONTAINERS_METHOD_NAME = "setKeepContainersAcrossApplicationAttempts"; + private static final String NODE_LABEL_EXPRESSION_NAME = "setNodeLabelExpression"; + + private final Method applicationTagsMethod; + private final Method attemptFailuresValidityIntervalMethod; + private final Method keepContainersMethod; + + @Nullable + private final Method nodeLabelExpressionMethod; + + private ApplicationSubmissionContextReflector(Class clazz) { + Method applicationTagsMethod; + Method attemptFailuresValidityIntervalMethod; + Method keepContainersMethod; + Method nodeLabelExpressionMethod; + + try { + // this method is only supported by Hadoop 2.4.0 onwards + applicationTagsMethod = clazz.getMethod(APPLICATION_TAGS_METHOD_NAME, Set.class); + LOG.debug("{} supports method {}.", clazz.getCanonicalName(), APPLICATION_TAGS_METHOD_NAME); + } catch (NoSuchMethodException e) { + LOG.debug("{} does not support method {}.", clazz.getCanonicalName(), APPLICATION_TAGS_METHOD_NAME); + // assign null because the Hadoop version apparently does not support this call. + applicationTagsMethod = null; + } + + this.applicationTagsMethod = applicationTagsMethod; + + try { + // this method is only supported by Hadoop 2.6.0 onwards + attemptFailuresValidityIntervalMethod = clazz.getMethod(ATTEMPT_FAILURES_METHOD_NAME, long.class); + LOG.debug("{} supports method {}.", clazz.getCanonicalName(), ATTEMPT_FAILURES_METHOD_NAME); + } catch (NoSuchMethodException e) { + LOG.debug("{} does not support method {}.", clazz.getCanonicalName(), ATTEMPT_FAILURES_METHOD_NAME); + // assign null because the Hadoop version apparently does not support this call. + attemptFailuresValidityIntervalMethod = null; + } + + this.attemptFailuresValidityIntervalMethod = attemptFailuresValidityIntervalMethod; + + try { + // this method is only supported by Hadoop 2.4.0 onwards + keepContainersMethod = clazz.getMethod(KEEP_CONTAINERS_METHOD_NAME, boolean.class); + LOG.debug("{} supports method {}.", clazz.getCanonicalName(), KEEP_CONTAINERS_METHOD_NAME); + } catch (NoSuchMethodException e) { + LOG.debug("{} does not support method {}.", clazz.getCanonicalName(), KEEP_CONTAINERS_METHOD_NAME); + // assign null because the Hadoop version apparently does not support this call. 
+ keepContainersMethod = null; + } + + this.keepContainersMethod = keepContainersMethod; + + try { + nodeLabelExpressionMethod = clazz.getMethod(NODE_LABEL_EXPRESSION_NAME, String.class); + LOG.debug("{} supports method {}.", clazz.getCanonicalName(), NODE_LABEL_EXPRESSION_NAME); + } catch (NoSuchMethodException e) { + LOG.debug("{} does not support method {}.", clazz.getCanonicalName(), NODE_LABEL_EXPRESSION_NAME); + nodeLabelExpressionMethod = null; + } + + this.nodeLabelExpressionMethod = nodeLabelExpressionMethod; + } + + public void setApplicationTags(ApplicationSubmissionContext appContext, Set applicationTags) + throws InvocationTargetException, IllegalAccessException { + if (applicationTagsMethod != null) { + LOG.debug( + "Calling method {} of {}.", + applicationTagsMethod.getName(), + appContext.getClass().getCanonicalName()); + applicationTagsMethod.invoke(appContext, applicationTags); + } else { + LOG.debug( + "{} does not support method {}. Doing nothing.", + appContext.getClass().getCanonicalName(), + APPLICATION_TAGS_METHOD_NAME); + } + } + + public void setApplicationNodeLabel(ApplicationSubmissionContext appContext, String nodeLabel) + throws InvocationTargetException, IllegalAccessException { + if (nodeLabelExpressionMethod != null) { + LOG.debug( + "Calling method {} of {}.", + nodeLabelExpressionMethod.getName(), + appContext.getClass().getCanonicalName()); + nodeLabelExpressionMethod.invoke(appContext, nodeLabel); + } else { + LOG.debug( + "{} does not support method {}. Doing nothing.", + appContext.getClass().getCanonicalName(), + NODE_LABEL_EXPRESSION_NAME); + } + } + + public void setAttemptFailuresValidityInterval(ApplicationSubmissionContext appContext, long validityInterval) + throws InvocationTargetException, IllegalAccessException { + if (attemptFailuresValidityIntervalMethod != null) { + LOG.debug( + "Calling method {} of {}.", + attemptFailuresValidityIntervalMethod.getName(), + appContext.getClass().getCanonicalName()); + attemptFailuresValidityIntervalMethod.invoke(appContext, validityInterval); + } else { + LOG.debug( + "{} does not support method {}. Doing nothing.", + appContext.getClass().getCanonicalName(), + ATTEMPT_FAILURES_METHOD_NAME); + } + } + + public void setKeepContainersAcrossApplicationAttempts( + ApplicationSubmissionContext appContext, boolean keepContainers) + throws InvocationTargetException, IllegalAccessException { + + if (keepContainersMethod != null) { + LOG.debug( + "Calling method {} of {}.", + keepContainersMethod.getName(), + appContext.getClass().getCanonicalName()); + keepContainersMethod.invoke(appContext, keepContainers); + } else { + LOG.debug( + "{} does not support method {}. 
Doing nothing.", + appContext.getClass().getCanonicalName(), + KEEP_CONTAINERS_METHOD_NAME); + } + } + } + + private static class YarnDeploymentException extends RuntimeException { + private static final long serialVersionUID = -812040641215388943L; + + public YarnDeploymentException(String message) { + super(message); + } + + public YarnDeploymentException(String message, Throwable cause) { + super(message, cause); + } + } + + private class DeploymentFailureHook extends Thread { + + private final YarnClient yarnClient; + private final YarnClientApplication yarnApplication; + private final Path yarnFilesDir; + + DeploymentFailureHook(YarnClientApplication yarnApplication, Path yarnFilesDir) { + this.yarnApplication = Preconditions.checkNotNull(yarnApplication); + this.yarnFilesDir = Preconditions.checkNotNull(yarnFilesDir); + + // A new yarn client need to be created in shutdown hook in order to avoid + // the yarn client has been closed by YarnClusterDescriptor. + this.yarnClient = YarnClient.createYarnClient(); + this.yarnClient.init(yarnConfiguration); + } + + @Override + public void run() { + LOG.info("Cancelling deployment from Deployment Failure Hook"); + yarnClient.start(); + failSessionDuringDeployment(yarnClient, yarnApplication); + yarnClient.stop(); + LOG.info("Deleting files in {}.", yarnFilesDir); + try { + FileSystem fs = FileSystem.get(yarnConfiguration); + + if (!fs.delete(yarnFilesDir, true)) { + throw new IOException("Deleting files in " + yarnFilesDir + " was unsuccessful"); + } + + fs.close(); + } catch (IOException e) { + LOG.error("Failed to delete Flink Jar and configuration files in HDFS", e); + } + } + } + + @VisibleForTesting + void addLibFoldersToShipFiles(Collection effectiveShipFiles) { + // Add lib folder to the ship files if the environment variable is set. + // This is for convenience when running from the command-line. + // (for other files users explicitly set the ship files) + String libDir = System.getenv().get(ENV_FLINK_LIB_DIR); + if (libDir != null) { + File directoryFile = new File(libDir); + if (directoryFile.isDirectory()) { + effectiveShipFiles.add(directoryFile); + } else { + throw new YarnDeploymentException("The environment variable '" + + ENV_FLINK_LIB_DIR + + "' is set to '" + + libDir + + "' but the directory doesn't exist."); + } + } else if (shipFiles.isEmpty()) { + LOG.warn( + "Environment variable '{}' not set and ship files have not been provided manually. " + + "Not shipping any library files.", + ENV_FLINK_LIB_DIR); + } + } + + @VisibleForTesting + void addUsrLibFolderToShipFiles(Collection effectiveShipFiles) { + // Add usrlib folder to the ship files if it exists + // Classes in the folder will be loaded by UserClassLoader if CLASSPATH_INCLUDE_USER_JAR is + // DISABLED. 
+ ClusterEntrypointUtils.tryFindUserLibDirectory().ifPresent(usrLibDirFile -> { + effectiveShipFiles.add(usrLibDirFile); + LOG.info("usrlib: {} will be shipped automatically.", usrLibDirFile.getAbsolutePath()); + }); + } + + @VisibleForTesting + void addPluginsFoldersToShipFiles(Collection effectiveShipFiles) { + final Optional pluginsDir = PluginConfig.getPluginsDir(); + pluginsDir.ifPresent(effectiveShipFiles::add); + } + + ContainerLaunchContext setupApplicationMasterContainer( + String yarnClusterEntrypoint, boolean hasKrb5, JobManagerProcessSpec processSpec) { + // ------------------ Prepare Application Master Container ------------------------------ + + // respect custom JVM options in the YAML file + String javaOpts = flinkConfiguration.getString(CoreOptions.FLINK_JVM_OPTIONS); + if (flinkConfiguration.getString(CoreOptions.FLINK_JM_JVM_OPTIONS).length() > 0) { + javaOpts += " " + flinkConfiguration.getString(CoreOptions.FLINK_JM_JVM_OPTIONS); + } + + // krb5.conf file will be available as local resource in JM/TM container + if (hasKrb5) { + javaOpts += " -Djava.security.krb5.conf=krb5.conf"; + } + + // Set up the container launch context for the application master + ContainerLaunchContext amContainer = Records.newRecord(ContainerLaunchContext.class); + + final Map startCommandValues = new HashMap<>(); + startCommandValues.put("java", "$JAVA_HOME/bin/java"); + + String jvmHeapMem = JobManagerProcessUtils.generateJvmParametersStr(processSpec, flinkConfiguration); + startCommandValues.put("jvmmem", jvmHeapMem); + + startCommandValues.put("jvmopts", javaOpts); + startCommandValues.put("logging", YarnLogConfigUtil.getLoggingYarnCommand(flinkConfiguration)); + + startCommandValues.put("class", yarnClusterEntrypoint); + startCommandValues.put( + "redirects", + "1> " + + ApplicationConstants.LOG_DIR_EXPANSION_VAR + + "/jobmanager.out " + + "2> " + + ApplicationConstants.LOG_DIR_EXPANSION_VAR + + "/jobmanager.err"); + String dynamicParameterListStr = JobManagerProcessUtils.generateDynamicConfigsStr(processSpec); + startCommandValues.put("args", dynamicParameterListStr); + + final String commandTemplate = flinkConfiguration.getString( + ConfigConstants.YARN_CONTAINER_START_COMMAND_TEMPLATE, + ConfigConstants.DEFAULT_YARN_CONTAINER_START_COMMAND_TEMPLATE); + final String amCommand = BootstrapTools.getStartCommand(commandTemplate, startCommandValues); + + amContainer.setCommands(Collections.singletonList(amCommand)); + + LOG.debug("Application Master start command: " + amCommand); + + return amContainer; + } + + private static YarnConfigOptions.UserJarInclusion getUserJarInclusionMode( + org.apache.flink.configuration.Configuration config) { + return config.get(YarnConfigOptions.CLASSPATH_INCLUDE_USER_JAR); + } + + private static boolean isUsrLibDirIncludedInShipFiles(List shipFiles) { + return shipFiles.stream() + .filter(File::isDirectory) + .map(File::getName) + .anyMatch(name -> name.equals(DEFAULT_FLINK_USR_LIB_DIR)); + } + + private void setClusterEntrypointInfoToConfig(final ApplicationReport report) { + checkNotNull(report); + + final ApplicationId appId = report.getApplicationId(); + final String host = report.getHost(); + final int port = report.getRpcPort(); + + LOG.info("Found Web Interface {}:{} of application '{}'.", host, port, appId); + + flinkConfiguration.setString(JobManagerOptions.ADDRESS, host); + flinkConfiguration.setInteger(JobManagerOptions.PORT, port); + + flinkConfiguration.setString(RestOptions.ADDRESS, host); + flinkConfiguration.setInteger(RestOptions.PORT, port); 
+ + flinkConfiguration.set(YarnConfigOptions.APPLICATION_ID, ConverterUtils.toString(appId)); + + setHAClusterIdIfNotSet(flinkConfiguration, appId); + } + + private void setHAClusterIdIfNotSet(Configuration configuration, ApplicationId appId) { + // set cluster-id to app id if not specified + if (!configuration.contains(HighAvailabilityOptions.HA_CLUSTER_ID)) { + configuration.set(HighAvailabilityOptions.HA_CLUSTER_ID, ConverterUtils.toString(appId)); + } + } + + public static void logDetachedClusterInformation(ApplicationId yarnApplicationId, Logger logger) { + logger.info( + "The Flink YARN session cluster has been started in detached mode. In order to " + + "stop Flink gracefully, use the following command:\n" + + "$ echo \"stop\" | ./bin/yarn-session.sh -id {}\n" + + "If this should not be possible, then you can also kill Flink via YARN's web interface or via:\n" + + "$ yarn application -kill {}\n" + + "Note that killing Flink might not clean up all job artifacts and temporary files.", + yarnApplicationId, + yarnApplicationId); + } + + @VisibleForTesting + Map generateApplicationMasterEnv( + final YarnApplicationFileUploader fileUploader, + final String classPathStr, + final String localFlinkJarStr, + final String appIdStr) + throws IOException { + final Map env = new HashMap<>(); + // set user specified app master environment variables + env.putAll(ConfigurationUtils.getPrefixedKeyValuePairs( + ResourceManagerOptions.CONTAINERIZED_MASTER_ENV_PREFIX, this.flinkConfiguration)); + // set Flink app class path + env.put(ENV_FLINK_CLASSPATH, classPathStr); + // Set FLINK_LIB_DIR to `lib` folder under working dir in container + env.put(ENV_FLINK_LIB_DIR, Path.CUR_DIR + "/" + ConfigConstants.DEFAULT_FLINK_LIB_DIR); + // Set FLINK_OPT_DIR to `opt` folder under working dir in container + env.put(ENV_FLINK_OPT_DIR, Path.CUR_DIR + "/" + ConfigConstants.DEFAULT_FLINK_OPT_DIR); + // set Flink on YARN internal configuration values + env.put(YarnConfigKeys.FLINK_DIST_JAR, localFlinkJarStr); + env.put(YarnConfigKeys.ENV_APP_ID, appIdStr); + env.put(YarnConfigKeys.ENV_CLIENT_HOME_DIR, fileUploader.getHomeDir().toString()); + env.put( + YarnConfigKeys.ENV_CLIENT_SHIP_FILES, + encodeYarnLocalResourceDescriptorListToString(fileUploader.getEnvShipResourceList())); + env.put( + YarnConfigKeys.FLINK_YARN_FILES, + fileUploader.getApplicationDir().toUri().toString()); + // https://github.com/apache/hadoop/blob/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/YarnApplicationSecurity.md#identity-on-an-insecure-cluster-hadoop_user_name + env.put( + YarnConfigKeys.ENV_HADOOP_USER_NAME, + UserGroupInformation.getCurrentUser().getUserName()); + // set classpath from YARN configuration + Utils.setupYarnClassPath(this.yarnConfiguration, env); + return env; + } +} diff --git a/dinky-client/dinky-client-1.16/src/main/java/org/dinky/executor/CustomTableEnvironmentImpl.java b/dinky-client/dinky-client-1.16/src/main/java/org/dinky/executor/CustomTableEnvironmentImpl.java index 54f63dcfce..c8c9104e8b 100644 --- a/dinky-client/dinky-client-1.16/src/main/java/org/dinky/executor/CustomTableEnvironmentImpl.java +++ b/dinky-client/dinky-client-1.16/src/main/java/org/dinky/executor/CustomTableEnvironmentImpl.java @@ -23,6 +23,7 @@ import org.dinky.context.DinkyClassLoaderContextHolder; import org.dinky.data.model.LineageRel; import org.dinky.data.result.SqlExplainResult; +import org.dinky.utils.JsonUtils; import org.dinky.utils.LineageContext; import org.apache.flink.api.dag.Transformation; @@ -50,9 +51,7 
@@ import org.apache.flink.table.operations.ddl.CreateTableOperation; import org.apache.flink.types.Row; -import java.io.File; import java.util.ArrayList; -import java.util.Arrays; import java.util.Collections; import java.util.List; import java.util.Map; @@ -62,7 +61,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.node.ObjectNode; @@ -118,19 +116,6 @@ public boolean parseAndLoadConfiguration(String statement, Map s return false; } - @Override - public void addJar(File... jarPath) { - Configuration configuration = this.getRootConfiguration(); - List jars = configuration.get(PipelineOptions.JARS); - if (jars == null) { - configuration.set( - PipelineOptions.JARS, - Arrays.stream(jarPath).map(File::getAbsolutePath).collect(Collectors.toList())); - } else { - CollUtil.addAll(jars, jarPath); - } - } - public ObjectNode getStreamGraph(String statement) { List operations = super.getParser().parse(statement); if (operations.size() != 1) { @@ -144,12 +129,7 @@ public ObjectNode getStreamGraph(String statement) { StreamGraph streamGraph = transOperatoinsToStreamGraph(modifyOperations); JSONGenerator jsonGenerator = new JSONGenerator(streamGraph); - try { - return (ObjectNode) mapper.readTree(jsonGenerator.getJSON()); - } catch (JsonProcessingException e) { - log.error("read streamGraph configure error: ", e); - return mapper.createObjectNode(); - } + return JsonUtils.parseObject(jsonGenerator.getJSON()); } private StreamGraph transOperatoinsToStreamGraph(List modifyOperations) { diff --git a/dinky-client/dinky-client-1.17/src/main/java/org/apache/flink/yarn/Utils.java b/dinky-client/dinky-client-1.17/src/main/java/org/apache/flink/yarn/Utils.java new file mode 100644 index 0000000000..614135fe33 --- /dev/null +++ b/dinky-client/dinky-client-1.17/src/main/java/org/apache/flink/yarn/Utils.java @@ -0,0 +1,578 @@ +/* + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +package org.apache.flink.yarn; + +import static org.apache.flink.util.Preconditions.checkArgument; +import static org.apache.flink.util.Preconditions.checkNotNull; +import static org.apache.flink.yarn.YarnConfigKeys.ENV_FLINK_CLASSPATH; +import static org.apache.flink.yarn.YarnConfigKeys.LOCAL_RESOURCE_DESCRIPTOR_SEPARATOR; + +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.flink.configuration.ConfigConstants; +import org.apache.flink.configuration.ConfigUtils; +import org.apache.flink.runtime.clusterframework.BootstrapTools; +import org.apache.flink.runtime.clusterframework.ContaineredTaskManagerParameters; +import org.apache.flink.runtime.util.HadoopUtils; +import org.apache.flink.util.StringUtils; +import org.apache.flink.util.function.FunctionWithException; +import org.apache.flink.yarn.configuration.YarnConfigOptions; +import org.apache.flink.yarn.configuration.YarnResourceManagerDriverConfiguration; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.DataOutputBuffer; +import org.apache.hadoop.security.Credentials; +import org.apache.hadoop.security.UserGroupInformation; +import org.apache.hadoop.security.token.Token; +import org.apache.hadoop.security.token.TokenIdentifier; +import org.apache.hadoop.util.StringInterner; +import org.apache.hadoop.yarn.api.ApplicationConstants; +import org.apache.hadoop.yarn.api.ApplicationConstants.Environment; +import org.apache.hadoop.yarn.api.records.ContainerLaunchContext; +import org.apache.hadoop.yarn.api.records.LocalResource; +import org.apache.hadoop.yarn.api.records.LocalResourceType; +import org.apache.hadoop.yarn.api.records.LocalResourceVisibility; +import org.apache.hadoop.yarn.api.records.Resource; +import org.apache.hadoop.yarn.conf.YarnConfiguration; +import org.apache.hadoop.yarn.security.AMRMTokenIdentifier; +import org.apache.hadoop.yarn.util.ConverterUtils; +import org.apache.hadoop.yarn.util.Records; + +import java.io.File; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Optional; +import java.util.stream.Stream; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import cn.hutool.core.util.StrUtil; + +/** Utility class that provides helper methods to work with Apache Hadoop YARN. */ +public final class Utils { + + private static final Logger LOG = LoggerFactory.getLogger(Utils.class); + + /** KRB5 file name populated in YARN container for secure IT run. */ + public static final String KRB5_FILE_NAME = "krb5.conf"; + + /** Yarn site xml file name populated in YARN container for secure IT run. */ + public static final String YARN_SITE_FILE_NAME = "yarn-site.xml"; + + /** The prefixes that Flink adds to the YARN config. 
*/ + private static final String[] FLINK_CONFIG_PREFIXES = {"flink.yarn."}; + + @VisibleForTesting + static final String YARN_RM_FAIR_SCHEDULER_CLAZZ = + "org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler"; + + @VisibleForTesting + static final String YARN_RM_SLS_FAIR_SCHEDULER_CLAZZ = "org.apache.hadoop.yarn.sls.scheduler.SLSFairScheduler"; + + @VisibleForTesting + static final String YARN_RM_INCREMENT_ALLOCATION_MB_KEY = "yarn.resource-types.memory-mb.increment-allocation"; + + @VisibleForTesting + static final String YARN_RM_INCREMENT_ALLOCATION_MB_LEGACY_KEY = "yarn.scheduler.increment-allocation-mb"; + + private static final int DEFAULT_YARN_RM_INCREMENT_ALLOCATION_MB = 1024; + + @VisibleForTesting + static final String YARN_RM_INCREMENT_ALLOCATION_VCORES_KEY = "yarn.resource-types.vcores.increment-allocation"; + + @VisibleForTesting + static final String YARN_RM_INCREMENT_ALLOCATION_VCORES_LEGACY_KEY = "yarn.scheduler.increment-allocation-vcores"; + + private static final int DEFAULT_YARN_RM_INCREMENT_ALLOCATION_VCORES = 1; + + public static void setupYarnClassPath(Configuration conf, Map appMasterEnv) { + addToEnvironment(appMasterEnv, Environment.CLASSPATH.name(), appMasterEnv.get(ENV_FLINK_CLASSPATH)); + String[] applicationClassPathEntries = conf.getStrings( + YarnConfiguration.YARN_APPLICATION_CLASSPATH, + Stream.of(YarnConfiguration.DEFAULT_YARN_APPLICATION_CLASSPATH) + .map(x -> StrUtil.removeAll(x, "%")) + .map(x -> "$".equals(StrUtil.subPre(x, 1)) ? x : "$" + x) + .toArray(String[]::new)); + for (String c : applicationClassPathEntries) { + addToEnvironment(appMasterEnv, Environment.CLASSPATH.name(), c.trim()); + } + } + + /** + * Deletes the YARN application files, e.g., Flink binaries, libraries, etc., from the remote + * filesystem. + * + * @param applicationFilesDir The application files directory. + */ + public static void deleteApplicationFiles(final String applicationFilesDir) { + if (!StringUtils.isNullOrWhitespaceOnly(applicationFilesDir)) { + final org.apache.flink.core.fs.Path path = new org.apache.flink.core.fs.Path(applicationFilesDir); + try { + final org.apache.flink.core.fs.FileSystem fileSystem = path.getFileSystem(); + if (!fileSystem.delete(path, true)) { + LOG.error("Deleting yarn application files under {} was unsuccessful.", applicationFilesDir); + } + } catch (final IOException e) { + LOG.error("Could not properly delete yarn application files directory {}.", applicationFilesDir, e); + } + } else { + LOG.debug("No yarn application files directory set. Therefore, cannot clean up the data."); + } + } + + /** + * Creates a YARN resource for the remote object at the given location. 
+ * + * @param remoteRsrcPath remote location of the resource + * @param resourceSize size of the resource + * @param resourceModificationTime last modification time of the resource + * @return YARN resource + */ + static LocalResource registerLocalResource( + Path remoteRsrcPath, + long resourceSize, + long resourceModificationTime, + LocalResourceVisibility resourceVisibility, + LocalResourceType resourceType) { + LocalResource localResource = Records.newRecord(LocalResource.class); + localResource.setResource(ConverterUtils.getYarnUrlFromURI(remoteRsrcPath.toUri())); + localResource.setSize(resourceSize); + localResource.setTimestamp(resourceModificationTime); + localResource.setType(resourceType); + localResource.setVisibility(resourceVisibility); + return localResource; + } + + /** + * Creates a YARN resource for the remote object at the given location. + * + * @param fs remote filesystem + * @param remoteRsrcPath resource path to be registered + * @return YARN resource + */ + private static LocalResource registerLocalResource( + FileSystem fs, Path remoteRsrcPath, LocalResourceType resourceType) throws IOException { + FileStatus jarStat = fs.getFileStatus(remoteRsrcPath); + return registerLocalResource( + remoteRsrcPath, + jarStat.getLen(), + jarStat.getModificationTime(), + LocalResourceVisibility.APPLICATION, + resourceType); + } + + /** + * Copied method from org.apache.hadoop.yarn.util.Apps. It was broken by YARN-1824 (2.4.0) and + * fixed for 2.4.1 by https://issues.apache.org/jira/browse/YARN-1931 + */ + public static void addToEnvironment(Map environment, String variable, String value) { + String val = environment.get(variable); + if (val == null) { + val = value; + } else { + val = val + YarnClusterDescriptor.pathSeparator + value; + } + environment.put(StringInterner.weakIntern(variable), StringInterner.weakIntern(val)); + } + + /** + * Resolve keytab path either as absolute path or relative to working directory. + * + * @param workingDir current working directory + * @param keytabPath configured keytab path. + * @return resolved keytab path, or null if not found. + */ + public static String resolveKeytabPath(String workingDir, String keytabPath) { + String keytab = null; + if (keytabPath != null) { + File f; + f = new File(keytabPath); + if (f.exists()) { + keytab = f.getAbsolutePath(); + LOG.info("Resolved keytab path: {}", keytab); + } else { + // try using relative paths, this is the case when the keytab was shipped + // as a local resource + f = new File(workingDir, keytabPath); + if (f.exists()) { + keytab = f.getAbsolutePath(); + LOG.info("Resolved keytab path: {}", keytab); + } else { + LOG.warn("Could not resolve keytab path with: {}", keytabPath); + keytab = null; + } + } + } + return keytab; + } + + /** Private constructor to prevent instantiation. */ + private Utils() { + throw new RuntimeException(); + } + + /** + * Creates the launch context, which describes how to bring up a TaskExecutor / TaskManager + * process in an allocated YARN container. + * + *
<p>
This code is extremely YARN specific and registers all the resources that the TaskExecutor + * needs (such as JAR file, config file, ...) and all environment variables in a YARN container + * launch context. The launch context then ensures that those resources will be copied into the + * containers transient working directory. + * + * @param flinkConfig The Flink configuration object. + * @param yarnConfig The YARN configuration object. + * @param configuration The YarnResourceManagerDriver configurations. + * @param tmParams The TaskExecutor container memory parameters. + * @param taskManagerDynamicProperties The dynamic configurations to be updated for the + * TaskExecutors based on client uploaded Flink config. + * @param workingDirectory The current application master container's working directory. + * @param taskManagerMainClass The class with the main method. + * @param log The logger. + * @return The launch context for the TaskManager processes. + * @throws Exception Thrown if the launch context could not be created, for example if the + * resources could not be copied. + */ + static ContainerLaunchContext createTaskExecutorContext( + org.apache.flink.configuration.Configuration flinkConfig, + YarnConfiguration yarnConfig, + YarnResourceManagerDriverConfiguration configuration, + ContaineredTaskManagerParameters tmParams, + String taskManagerDynamicProperties, + String workingDirectory, + Class taskManagerMainClass, + Logger log) + throws Exception { + + // get and validate all relevant variables + + String remoteFlinkJarPath = checkNotNull( + configuration.getFlinkDistJar(), "Environment variable %s not set", YarnConfigKeys.FLINK_DIST_JAR); + + String shipListString = checkNotNull( + configuration.getClientShipFiles(), + "Environment variable %s not set", + YarnConfigKeys.ENV_CLIENT_SHIP_FILES); + + final String remoteKeytabPath = configuration.getRemoteKeytabPath(); + final String localKeytabPath = configuration.getLocalKeytabPath(); + final String keytabPrincipal = configuration.getKeytabPrinciple(); + final String remoteYarnConfPath = configuration.getYarnSiteXMLPath(); + final String remoteKrb5Path = configuration.getKrb5Path(); + + if (log.isDebugEnabled()) { + log.debug("TM:remote keytab path obtained {}", remoteKeytabPath); + log.debug("TM:local keytab path obtained {}", localKeytabPath); + log.debug("TM:keytab principal obtained {}", keytabPrincipal); + log.debug("TM:remote yarn conf path obtained {}", remoteYarnConfPath); + log.debug("TM:remote krb5 path obtained {}", remoteKrb5Path); + } + + String classPathString = checkNotNull( + configuration.getFlinkClasspath(), + "Environment variable %s not set", + YarnConfigKeys.ENV_FLINK_CLASSPATH); + + // register keytab + LocalResource keytabResource = null; + if (remoteKeytabPath != null) { + log.info("TM:Adding keytab {} to the container local resource bucket", remoteKeytabPath); + Path keytabPath = new Path(remoteKeytabPath); + FileSystem fs = keytabPath.getFileSystem(yarnConfig); + keytabResource = registerLocalResource(fs, keytabPath, LocalResourceType.FILE); + } + + // To support Yarn Secure Integration Test Scenario + LocalResource yarnConfResource = null; + if (remoteYarnConfPath != null) { + log.info("TM:Adding remoteYarnConfPath {} to the container local resource bucket", remoteYarnConfPath); + Path yarnConfPath = new Path(remoteYarnConfPath); + FileSystem fs = yarnConfPath.getFileSystem(yarnConfig); + yarnConfResource = registerLocalResource(fs, yarnConfPath, LocalResourceType.FILE); + } + + // register krb5.conf + 
LocalResource krb5ConfResource = null; + boolean hasKrb5 = false; + if (remoteKrb5Path != null) { + log.info("Adding remoteKrb5Path {} to the container local resource bucket", remoteKrb5Path); + Path krb5ConfPath = new Path(remoteKrb5Path); + FileSystem fs = krb5ConfPath.getFileSystem(yarnConfig); + krb5ConfResource = registerLocalResource(fs, krb5ConfPath, LocalResourceType.FILE); + hasKrb5 = true; + } + + Map taskManagerLocalResources = new HashMap<>(); + + // register Flink Jar with remote HDFS + final YarnLocalResourceDescriptor flinkDistLocalResourceDesc = + YarnLocalResourceDescriptor.fromString(remoteFlinkJarPath); + taskManagerLocalResources.put( + flinkDistLocalResourceDesc.getResourceKey(), flinkDistLocalResourceDesc.toLocalResource()); + + // To support Yarn Secure Integration Test Scenario + if (yarnConfResource != null) { + taskManagerLocalResources.put(YARN_SITE_FILE_NAME, yarnConfResource); + } + if (krb5ConfResource != null) { + taskManagerLocalResources.put(KRB5_FILE_NAME, krb5ConfResource); + } + if (keytabResource != null) { + taskManagerLocalResources.put(localKeytabPath, keytabResource); + } + + // prepare additional files to be shipped + decodeYarnLocalResourceDescriptorListFromString(shipListString) + .forEach(resourceDesc -> + taskManagerLocalResources.put(resourceDesc.getResourceKey(), resourceDesc.toLocalResource())); + + // now that all resources are prepared, we can create the launch context + + log.info("Creating container launch context for TaskManagers"); + + boolean hasLogback = new File(workingDirectory, "logback.xml").exists(); + boolean hasLog4j = new File(workingDirectory, "log4j.properties").exists(); + + String launchCommand = BootstrapTools.getTaskManagerShellCommand( + flinkConfig, + tmParams, + ".", + ApplicationConstants.LOG_DIR_EXPANSION_VAR, + hasLogback, + hasLog4j, + hasKrb5, + taskManagerMainClass, + taskManagerDynamicProperties); + + if (log.isDebugEnabled()) { + log.debug("Starting TaskManagers with command: " + launchCommand); + } else { + log.info("Starting TaskManagers"); + } + + ContainerLaunchContext ctx = Records.newRecord(ContainerLaunchContext.class); + ctx.setCommands(Collections.singletonList(launchCommand)); + ctx.setLocalResources(taskManagerLocalResources); + + Map containerEnv = new HashMap<>(); + containerEnv.putAll(tmParams.taskManagerEnv()); + + // add YARN classpath, etc to the container environment + containerEnv.put(ENV_FLINK_CLASSPATH, classPathString); + setupYarnClassPath(yarnConfig, containerEnv); + + containerEnv.put( + YarnConfigKeys.ENV_HADOOP_USER_NAME, + UserGroupInformation.getCurrentUser().getUserName()); + + if (remoteKeytabPath != null && localKeytabPath != null && keytabPrincipal != null) { + containerEnv.put(YarnConfigKeys.REMOTE_KEYTAB_PATH, remoteKeytabPath); + containerEnv.put(YarnConfigKeys.LOCAL_KEYTAB_PATH, localKeytabPath); + containerEnv.put(YarnConfigKeys.KEYTAB_PRINCIPAL, keytabPrincipal); + } else if (localKeytabPath != null && keytabPrincipal != null) { + containerEnv.put(YarnConfigKeys.LOCAL_KEYTAB_PATH, localKeytabPath); + containerEnv.put(YarnConfigKeys.KEYTAB_PRINCIPAL, keytabPrincipal); + } + + ctx.setEnvironment(containerEnv); + + // For TaskManager YARN container context, read the tokens from the jobmanager yarn + // container local file. + // NOTE: must read the tokens from the local file, not from the UGI context, because if UGI + // is login + // using Kerberos keytabs, there is no HDFS delegation token in the UGI context. 
+ final String fileLocation = System.getenv(UserGroupInformation.HADOOP_TOKEN_FILE_LOCATION); + + if (fileLocation != null) { + log.debug("Adding security tokens to TaskExecutor's container launch context."); + + try (DataOutputBuffer dob = new DataOutputBuffer()) { + Credentials cred = Credentials.readTokenStorageFile( + new File(fileLocation), HadoopUtils.getHadoopConfiguration(flinkConfig)); + + // Filter out AMRMToken before setting the tokens to the TaskManager container + // context. + Credentials taskManagerCred = new Credentials(); + Collection> userTokens = cred.getAllTokens(); + for (Token token : userTokens) { + if (!token.getKind().equals(AMRMTokenIdentifier.KIND_NAME)) { + taskManagerCred.addToken(token.getService(), token); + } + } + + taskManagerCred.writeTokenStorageToStream(dob); + ByteBuffer securityTokens = ByteBuffer.wrap(dob.getData(), 0, dob.getLength()); + ctx.setTokens(securityTokens); + } catch (Throwable t) { + log.error("Failed to add Hadoop's security tokens.", t); + } + } else { + log.info("Could not set security tokens because Hadoop's token file location is unknown."); + } + + return ctx; + } + + static boolean isRemotePath(String path) throws IOException { + org.apache.flink.core.fs.Path flinkPath = new org.apache.flink.core.fs.Path(path); + return flinkPath.getFileSystem().isDistributedFS(); + } + + private static List decodeYarnLocalResourceDescriptorListFromString(String resources) + throws Exception { + final List resourceDescriptors = new ArrayList<>(); + for (String shipResourceDescStr : resources.split(LOCAL_RESOURCE_DESCRIPTOR_SEPARATOR)) { + if (!shipResourceDescStr.isEmpty()) { + resourceDescriptors.add(YarnLocalResourceDescriptor.fromString(shipResourceDescStr)); + } + } + return resourceDescriptors; + } + + @VisibleForTesting + static Resource getUnitResource(YarnConfiguration yarnConfig) { + final int unitMemMB, unitVcore; + + final String yarnRmSchedulerClazzName = yarnConfig.get(YarnConfiguration.RM_SCHEDULER); + if (Objects.equals(yarnRmSchedulerClazzName, YARN_RM_FAIR_SCHEDULER_CLAZZ) + || Objects.equals(yarnRmSchedulerClazzName, YARN_RM_SLS_FAIR_SCHEDULER_CLAZZ)) { + String propMem = yarnConfig.get(YARN_RM_INCREMENT_ALLOCATION_MB_KEY); + String propVcore = yarnConfig.get(YARN_RM_INCREMENT_ALLOCATION_VCORES_KEY); + + unitMemMB = propMem != null + ? Integer.parseInt(propMem) + : yarnConfig.getInt( + YARN_RM_INCREMENT_ALLOCATION_MB_LEGACY_KEY, DEFAULT_YARN_RM_INCREMENT_ALLOCATION_MB); + unitVcore = propVcore != null + ? 
Integer.parseInt(propVcore) + : yarnConfig.getInt( + YARN_RM_INCREMENT_ALLOCATION_VCORES_LEGACY_KEY, + DEFAULT_YARN_RM_INCREMENT_ALLOCATION_VCORES); + } else { + unitMemMB = yarnConfig.getInt( + YarnConfiguration.RM_SCHEDULER_MINIMUM_ALLOCATION_MB, + YarnConfiguration.DEFAULT_RM_SCHEDULER_MINIMUM_ALLOCATION_MB); + unitVcore = yarnConfig.getInt( + YarnConfiguration.RM_SCHEDULER_MINIMUM_ALLOCATION_VCORES, + YarnConfiguration.DEFAULT_RM_SCHEDULER_MINIMUM_ALLOCATION_VCORES); + } + + return Resource.newInstance(unitMemMB, unitVcore); + } + + public static List getQualifiedRemoteProvidedLibDirs( + org.apache.flink.configuration.Configuration configuration, YarnConfiguration yarnConfiguration) + throws IOException { + + return getRemoteSharedLibPaths(configuration, pathStr -> { + final Path path = new Path(pathStr); + return path.getFileSystem(yarnConfiguration).makeQualified(path); + }); + } + + private static List getRemoteSharedLibPaths( + org.apache.flink.configuration.Configuration configuration, + FunctionWithException strToPathMapper) + throws IOException { + + final List providedLibDirs = + ConfigUtils.decodeListFromConfig(configuration, YarnConfigOptions.PROVIDED_LIB_DIRS, strToPathMapper); + + for (Path path : providedLibDirs) { + if (!Utils.isRemotePath(path.toString())) { + throw new IllegalArgumentException("The \"" + + YarnConfigOptions.PROVIDED_LIB_DIRS.key() + + "\" should only contain" + + " dirs accessible from all worker nodes, while the \"" + + path + + "\" is local."); + } + } + return providedLibDirs; + } + + public static boolean isUsrLibDirectory(final FileSystem fileSystem, final Path path) throws IOException { + final FileStatus fileStatus = fileSystem.getFileStatus(path); + // Use the Path obj from fileStatus to get rid of trailing slash + return fileStatus.isDirectory() + && ConfigConstants.DEFAULT_FLINK_USR_LIB_DIR.equals( + fileStatus.getPath().getName()); + } + + public static Optional getQualifiedRemoteProvidedUsrLib( + org.apache.flink.configuration.Configuration configuration, YarnConfiguration yarnConfiguration) + throws IOException, IllegalArgumentException { + String usrlib = configuration.getString(YarnConfigOptions.PROVIDED_USRLIB_DIR); + if (usrlib == null) { + return Optional.empty(); + } + final Path qualifiedUsrLibPath = FileSystem.get(yarnConfiguration).makeQualified(new Path(usrlib)); + checkArgument( + isRemotePath(qualifiedUsrLibPath.toString()), + "The \"%s\" must point to a remote dir " + "which is accessible from all worker nodes.", + YarnConfigOptions.PROVIDED_USRLIB_DIR.key()); + checkArgument( + isUsrLibDirectory(FileSystem.get(yarnConfiguration), qualifiedUsrLibPath), + "The \"%s\" should be named with \"%s\".", + YarnConfigOptions.PROVIDED_USRLIB_DIR.key(), + ConfigConstants.DEFAULT_FLINK_USR_LIB_DIR); + return Optional.of(qualifiedUsrLibPath); + } + + public static YarnConfiguration getYarnAndHadoopConfiguration( + org.apache.flink.configuration.Configuration flinkConfig) { + final YarnConfiguration yarnConfig = getYarnConfiguration(flinkConfig); + yarnConfig.addResource(HadoopUtils.getHadoopConfiguration(flinkConfig)); + + return yarnConfig; + } + + /** + * Add additional config entries from the flink config to the yarn config. + * + * @param flinkConfig The Flink configuration object. + * @return The yarn configuration. 
+ */ + public static YarnConfiguration getYarnConfiguration(org.apache.flink.configuration.Configuration flinkConfig) { + final YarnConfiguration yarnConfig = new YarnConfiguration(); + + for (String key : flinkConfig.keySet()) { + for (String prefix : FLINK_CONFIG_PREFIXES) { + if (key.startsWith(prefix)) { + String newKey = key.substring("flink.".length()); + String value = flinkConfig.getString(key, null); + yarnConfig.set(newKey, value); + LOG.debug("Adding Flink config entry for {} as {}={} to Yarn config", key, newKey, value); + } + } + } + + return yarnConfig; + } +} diff --git a/dinky-client/dinky-client-1.17/src/main/java/org/apache/flink/yarn/YarnClusterDescriptor.java b/dinky-client/dinky-client-1.17/src/main/java/org/apache/flink/yarn/YarnClusterDescriptor.java new file mode 100644 index 0000000000..859c248b26 --- /dev/null +++ b/dinky-client/dinky-client-1.17/src/main/java/org/apache/flink/yarn/YarnClusterDescriptor.java @@ -0,0 +1,1753 @@ +/* + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package org.apache.flink.yarn; + +import static org.apache.flink.client.deployment.application.ApplicationConfiguration.APPLICATION_MAIN_CLASS; +import static org.apache.flink.configuration.ConfigConstants.DEFAULT_FLINK_USR_LIB_DIR; +import static org.apache.flink.configuration.ConfigConstants.ENV_FLINK_LIB_DIR; +import static org.apache.flink.configuration.ConfigConstants.ENV_FLINK_OPT_DIR; +import static org.apache.flink.runtime.entrypoint.component.FileJobGraphRetriever.JOB_GRAPH_FILE_PATH; +import static org.apache.flink.util.Preconditions.checkArgument; +import static org.apache.flink.util.Preconditions.checkNotNull; +import static org.apache.flink.yarn.YarnConfigKeys.ENV_FLINK_CLASSPATH; +import static org.apache.flink.yarn.YarnConfigKeys.LOCAL_RESOURCE_DESCRIPTOR_SEPARATOR; + +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.flink.api.common.cache.DistributedCache; +import org.apache.flink.api.java.tuple.Tuple2; +import org.apache.flink.client.deployment.ClusterDeploymentException; +import org.apache.flink.client.deployment.ClusterDescriptor; +import org.apache.flink.client.deployment.ClusterRetrieveException; +import org.apache.flink.client.deployment.ClusterSpecification; +import org.apache.flink.client.deployment.application.ApplicationConfiguration; +import org.apache.flink.client.program.ClusterClientProvider; +import org.apache.flink.client.program.PackagedProgramUtils; +import org.apache.flink.client.program.rest.RestClusterClient; +import org.apache.flink.configuration.ConfigConstants; +import org.apache.flink.configuration.ConfigOption; +import org.apache.flink.configuration.ConfigUtils; +import org.apache.flink.configuration.Configuration; +import 
org.apache.flink.configuration.ConfigurationUtils; +import org.apache.flink.configuration.CoreOptions; +import org.apache.flink.configuration.HighAvailabilityOptions; +import org.apache.flink.configuration.IllegalConfigurationException; +import org.apache.flink.configuration.JobManagerOptions; +import org.apache.flink.configuration.PipelineOptions; +import org.apache.flink.configuration.ResourceManagerOptions; +import org.apache.flink.configuration.RestOptions; +import org.apache.flink.configuration.SecurityOptions; +import org.apache.flink.configuration.TaskManagerOptions; +import org.apache.flink.core.plugin.PluginConfig; +import org.apache.flink.core.plugin.PluginUtils; +import org.apache.flink.runtime.clusterframework.BootstrapTools; +import org.apache.flink.runtime.entrypoint.ClusterEntrypoint; +import org.apache.flink.runtime.entrypoint.ClusterEntrypointUtils; +import org.apache.flink.runtime.jobgraph.JobGraph; +import org.apache.flink.runtime.jobmanager.HighAvailabilityMode; +import org.apache.flink.runtime.jobmanager.JobManagerProcessSpec; +import org.apache.flink.runtime.jobmanager.JobManagerProcessUtils; +import org.apache.flink.runtime.security.token.DefaultDelegationTokenManager; +import org.apache.flink.runtime.security.token.DelegationTokenContainer; +import org.apache.flink.runtime.security.token.DelegationTokenManager; +import org.apache.flink.runtime.security.token.hadoop.HadoopDelegationTokenConverter; +import org.apache.flink.runtime.security.token.hadoop.KerberosLoginProvider; +import org.apache.flink.runtime.util.HadoopUtils; +import org.apache.flink.util.CollectionUtil; +import org.apache.flink.util.FlinkException; +import org.apache.flink.util.Preconditions; +import org.apache.flink.util.ShutdownHookUtil; +import org.apache.flink.util.StringUtils; +import org.apache.flink.yarn.configuration.YarnConfigOptions; +import org.apache.flink.yarn.configuration.YarnConfigOptionsInternal; +import org.apache.flink.yarn.configuration.YarnDeploymentTarget; +import org.apache.flink.yarn.configuration.YarnLogConfigUtil; +import org.apache.flink.yarn.entrypoint.YarnApplicationClusterEntryPoint; +import org.apache.flink.yarn.entrypoint.YarnJobClusterEntrypoint; +import org.apache.flink.yarn.entrypoint.YarnSessionClusterEntrypoint; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hdfs.DFSConfigKeys; +import org.apache.hadoop.security.Credentials; +import org.apache.hadoop.security.UserGroupInformation; +import org.apache.hadoop.security.token.Token; +import org.apache.hadoop.security.token.TokenIdentifier; +import org.apache.hadoop.yarn.api.ApplicationConstants; +import org.apache.hadoop.yarn.api.protocolrecords.GetNewApplicationResponse; +import org.apache.hadoop.yarn.api.records.ApplicationId; +import org.apache.hadoop.yarn.api.records.ApplicationReport; +import org.apache.hadoop.yarn.api.records.ApplicationSubmissionContext; +import org.apache.hadoop.yarn.api.records.ContainerLaunchContext; +import org.apache.hadoop.yarn.api.records.FinalApplicationStatus; +import org.apache.hadoop.yarn.api.records.LocalResourceType; +import org.apache.hadoop.yarn.api.records.NodeReport; +import org.apache.hadoop.yarn.api.records.NodeState; +import org.apache.hadoop.yarn.api.records.Priority; +import org.apache.hadoop.yarn.api.records.QueueInfo; +import org.apache.hadoop.yarn.api.records.Resource; +import org.apache.hadoop.yarn.api.records.YarnApplicationState; +import org.apache.hadoop.yarn.api.records.YarnClusterMetrics; +import 
org.apache.hadoop.yarn.client.api.YarnClient; +import org.apache.hadoop.yarn.client.api.YarnClientApplication; +import org.apache.hadoop.yarn.conf.YarnConfiguration; +import org.apache.hadoop.yarn.exceptions.YarnException; +import org.apache.hadoop.yarn.util.ConverterUtils; +import org.apache.hadoop.yarn.util.Records; + +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.ObjectOutputStream; +import java.io.PrintStream; +import java.io.UnsupportedEncodingException; +import java.lang.reflect.InvocationTargetException; +import java.lang.reflect.Method; +import java.net.URI; +import java.net.URLDecoder; +import java.nio.ByteBuffer; +import java.nio.charset.Charset; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Set; +import java.util.stream.Collectors; + +import javax.annotation.Nullable; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** The descriptor with deployment information for deploying a Flink cluster on Yarn. */ +public class YarnClusterDescriptor implements ClusterDescriptor { + private static final Logger LOG = LoggerFactory.getLogger(YarnClusterDescriptor.class); + public static final String pathSeparator = ":"; + + private final YarnConfiguration yarnConfiguration; + + private final YarnClient yarnClient; + + private final YarnClusterInformationRetriever yarnClusterInformationRetriever; + + /** True if the descriptor must not shut down the YarnClient. */ + private final boolean sharedYarnClient; + + /** Lazily initialized list of files to ship. */ + private final List shipFiles = new LinkedList<>(); + + private final List shipArchives = new LinkedList<>(); + + private final String yarnQueue; + + private Path flinkJarPath; + + private final Configuration flinkConfiguration; + + private final String customName; + + private final String nodeLabel; + + private final String applicationType; + + private YarnConfigOptions.UserJarInclusion userJarInclusion; + + public YarnClusterDescriptor( + Configuration flinkConfiguration, + YarnConfiguration yarnConfiguration, + YarnClient yarnClient, + YarnClusterInformationRetriever yarnClusterInformationRetriever, + boolean sharedYarnClient) { + + this.yarnConfiguration = Preconditions.checkNotNull(yarnConfiguration); + this.yarnClient = Preconditions.checkNotNull(yarnClient); + this.yarnClusterInformationRetriever = Preconditions.checkNotNull(yarnClusterInformationRetriever); + this.sharedYarnClient = sharedYarnClient; + + this.flinkConfiguration = Preconditions.checkNotNull(flinkConfiguration); + this.userJarInclusion = getUserJarInclusionMode(flinkConfiguration); + + getLocalFlinkDistPath(flinkConfiguration).ifPresent(this::setLocalJarPath); + decodeFilesToShipToCluster(flinkConfiguration, YarnConfigOptions.SHIP_FILES) + .ifPresent(this::addShipFiles); + decodeFilesToShipToCluster(flinkConfiguration, YarnConfigOptions.SHIP_ARCHIVES) + .ifPresent(this::addShipArchives); + + this.yarnQueue = flinkConfiguration.getString(YarnConfigOptions.APPLICATION_QUEUE); + this.customName = flinkConfiguration.getString(YarnConfigOptions.APPLICATION_NAME); + this.applicationType = flinkConfiguration.getString(YarnConfigOptions.APPLICATION_TYPE); + this.nodeLabel = flinkConfiguration.getString(YarnConfigOptions.NODE_LABEL); + } + + private Optional> 
decodeFilesToShipToCluster( + final Configuration configuration, final ConfigOption> configOption) { + checkNotNull(configuration); + checkNotNull(configOption); + + final List files = ConfigUtils.decodeListFromConfig(configuration, configOption, File::new); + return files.isEmpty() ? Optional.empty() : Optional.of(files); + } + + private Optional getLocalFlinkDistPath(final Configuration configuration) { + final String localJarPath = configuration.getString(YarnConfigOptions.FLINK_DIST_JAR); + if (localJarPath != null) { + return Optional.of(new Path(localJarPath)); + } + + LOG.info("No path for the flink jar passed. Using the location of " + getClass() + " to locate the jar"); + + // check whether it's actually a jar file --> when testing we execute this class without a + // flink-dist jar + final String decodedPath = getDecodedJarPath(); + return decodedPath.endsWith(".jar") ? Optional.of(new Path(new File(decodedPath).toURI())) : Optional.empty(); + } + + private String getDecodedJarPath() { + final String encodedJarPath = YarnClusterClientFactory.class + .getProtectionDomain() + .getCodeSource() + .getLocation() + .getPath(); + try { + return URLDecoder.decode(encodedJarPath, Charset.defaultCharset().name()); + } catch (UnsupportedEncodingException e) { + throw new RuntimeException("Couldn't decode the encoded Flink dist jar path: " + + encodedJarPath + + " You can supply a path manually via the command line."); + } + } + + @VisibleForTesting + List getShipFiles() { + return shipFiles; + } + + public YarnClient getYarnClient() { + return yarnClient; + } + + /** + * The class to start the application master with. This class runs the main method in case of + * session cluster. + */ + protected String getYarnSessionClusterEntrypoint() { + return YarnSessionClusterEntrypoint.class.getName(); + } + + /** + * The class to start the application master with. This class runs the main method in case of + * the job cluster. + */ + protected String getYarnJobClusterEntrypoint() { + return YarnJobClusterEntrypoint.class.getName(); + } + + public Configuration getFlinkConfiguration() { + return flinkConfiguration; + } + + public void setLocalJarPath(Path localJarPath) { + if (!localJarPath.toString().endsWith("jar")) { + throw new IllegalArgumentException( + "The passed jar path ('" + localJarPath + "') does not end with the 'jar' extension"); + } + this.flinkJarPath = localJarPath; + } + + /** + * Adds the given files to the list of files to ship. + * + *
<p>
Note that any file matching "flink-dist*.jar" will be excluded from the upload by + * {@link YarnApplicationFileUploader#registerMultipleLocalResources(Collection, String, + * LocalResourceType)} since we upload the Flink uber jar ourselves and do not need to deploy it + * multiple times. + * + * @param shipFiles files to ship + */ + public void addShipFiles(List shipFiles) { + checkArgument( + !isUsrLibDirIncludedInShipFiles(shipFiles), + "User-shipped directories configured via : %s should not include %s.", + YarnConfigOptions.SHIP_FILES.key(), + ConfigConstants.DEFAULT_FLINK_USR_LIB_DIR); + this.shipFiles.addAll(shipFiles); + } + + private void addShipArchives(List shipArchives) { + checkArgument(isArchiveOnlyIncludedInShipArchiveFiles(shipArchives), "Non-archive files are included."); + this.shipArchives.addAll(shipArchives); + } + + private static boolean isArchiveOnlyIncludedInShipArchiveFiles(List shipFiles) { + return shipFiles.stream() + .filter(File::isFile) + .map(File::getName) + .map(String::toLowerCase) + .allMatch(name -> name.endsWith(".tar.gz") + || name.endsWith(".tar") + || name.endsWith(".tgz") + || name.endsWith(".dst") + || name.endsWith(".jar") + || name.endsWith(".zip")); + } + + private void isReadyForDeployment(ClusterSpecification clusterSpecification) throws Exception { + + if (this.flinkJarPath == null) { + throw new YarnDeploymentException("The Flink jar path is null"); + } + if (this.flinkConfiguration == null) { + throw new YarnDeploymentException("Flink configuration object has not been set"); + } + + // Check if we don't exceed YARN's maximum virtual cores. + final int numYarnMaxVcores = yarnClusterInformationRetriever.getMaxVcores(); + + int configuredAmVcores = flinkConfiguration.getInteger(YarnConfigOptions.APP_MASTER_VCORES); + if (configuredAmVcores > numYarnMaxVcores) { + throw new IllegalConfigurationException(String.format( + "The number of requested virtual cores for application master %d" + + " exceeds the maximum number of virtual cores %d available in the Yarn Cluster.", + configuredAmVcores, numYarnMaxVcores)); + } + + int configuredVcores = + flinkConfiguration.getInteger(YarnConfigOptions.VCORES, clusterSpecification.getSlotsPerTaskManager()); + // don't configure more than the maximum configured number of vcores + if (configuredVcores > numYarnMaxVcores) { + throw new IllegalConfigurationException(String.format( + "The number of requested virtual cores per node %d" + + " exceeds the maximum number of virtual cores %d available in the Yarn Cluster." + + " Please note that the number of virtual cores is set to the number of task slots by default" + + " unless configured in the Flink config with '%s.'", + configuredVcores, numYarnMaxVcores, YarnConfigOptions.VCORES.key())); + } + + // check if required Hadoop environment variables are set. If not, warn user + if (System.getenv("HADOOP_CONF_DIR") == null && System.getenv("YARN_CONF_DIR") == null) { + LOG.warn("Neither the HADOOP_CONF_DIR nor the YARN_CONF_DIR environment variable is set. 
" + + "The Flink YARN Client needs one of these to be set to properly load the Hadoop " + + "configuration for accessing YARN."); + } + } + + public String getNodeLabel() { + return nodeLabel; + } + + // ------------------------------------------------------------- + // Lifecycle management + // ------------------------------------------------------------- + + @Override + public void close() { + if (!sharedYarnClient) { + yarnClient.stop(); + } + } + + // ------------------------------------------------------------- + // ClusterClient overrides + // ------------------------------------------------------------- + + @Override + public ClusterClientProvider retrieve(ApplicationId applicationId) throws ClusterRetrieveException { + + try { + // check if required Hadoop environment variables are set. If not, warn user + if (System.getenv("HADOOP_CONF_DIR") == null && System.getenv("YARN_CONF_DIR") == null) { + LOG.warn("Neither the HADOOP_CONF_DIR nor the YARN_CONF_DIR environment variable is set." + + "The Flink YARN Client needs one of these to be set to properly load the Hadoop " + + "configuration for accessing YARN."); + } + + final ApplicationReport report = yarnClient.getApplicationReport(applicationId); + + if (report.getFinalApplicationStatus() != FinalApplicationStatus.UNDEFINED) { + // Flink cluster is not running anymore + LOG.error( + "The application {} doesn't run anymore. It has previously completed with final status: {}", + applicationId, + report.getFinalApplicationStatus()); + throw new RuntimeException("The Yarn application " + applicationId + " doesn't run anymore."); + } + + setClusterEntrypointInfoToConfig(report); + + return () -> { + try { + return new RestClusterClient<>(flinkConfiguration, report.getApplicationId()); + } catch (Exception e) { + throw new RuntimeException("Couldn't retrieve Yarn cluster", e); + } + }; + } catch (Exception e) { + throw new ClusterRetrieveException("Couldn't retrieve Yarn cluster", e); + } + } + + @Override + public ClusterClientProvider deploySessionCluster(ClusterSpecification clusterSpecification) + throws ClusterDeploymentException { + try { + return deployInternal( + clusterSpecification, "Flink session cluster", getYarnSessionClusterEntrypoint(), null, false); + } catch (Exception e) { + throw new ClusterDeploymentException("Couldn't deploy Yarn session cluster", e); + } + } + + @Override + public ClusterClientProvider deployApplicationCluster( + final ClusterSpecification clusterSpecification, final ApplicationConfiguration applicationConfiguration) + throws ClusterDeploymentException { + checkNotNull(clusterSpecification); + checkNotNull(applicationConfiguration); + + final YarnDeploymentTarget deploymentTarget = YarnDeploymentTarget.fromConfig(flinkConfiguration); + if (YarnDeploymentTarget.APPLICATION != deploymentTarget) { + throw new ClusterDeploymentException("Couldn't deploy Yarn Application Cluster." + + " Expected deployment.target=" + + YarnDeploymentTarget.APPLICATION.getName() + + " but actual one was \"" + + deploymentTarget.getName() + + "\""); + } + + applicationConfiguration.applyToConfiguration(flinkConfiguration); + + // No need to do pipelineJars validation if it is a PyFlink job. 
+ if (!(PackagedProgramUtils.isPython(applicationConfiguration.getApplicationClassName()) + || PackagedProgramUtils.isPython(applicationConfiguration.getProgramArguments()))) { + final List pipelineJars = + flinkConfiguration.getOptional(PipelineOptions.JARS).orElse(Collections.emptyList()); + Preconditions.checkArgument(pipelineJars.size() == 1, "Should only have one jar"); + } + + try { + return deployInternal( + clusterSpecification, + "Flink Application Cluster", + YarnApplicationClusterEntryPoint.class.getName(), + null, + false); + } catch (Exception e) { + throw new ClusterDeploymentException("Couldn't deploy Yarn Application Cluster", e); + } + } + + @Override + public ClusterClientProvider deployJobCluster( + ClusterSpecification clusterSpecification, JobGraph jobGraph, boolean detached) + throws ClusterDeploymentException { + + LOG.warn( + "Job Clusters are deprecated since Flink 1.15. Please use an Application Cluster/Application Mode instead."); + try { + return deployInternal( + clusterSpecification, "Flink per-job cluster", getYarnJobClusterEntrypoint(), jobGraph, detached); + } catch (Exception e) { + throw new ClusterDeploymentException("Could not deploy Yarn job cluster.", e); + } + } + + @Override + public void killCluster(ApplicationId applicationId) throws FlinkException { + try { + yarnClient.killApplication(applicationId); + + try (final FileSystem fs = FileSystem.get(yarnConfiguration)) { + final Path applicationDir = + YarnApplicationFileUploader.getApplicationDirPath(getStagingDir(fs), applicationId); + + Utils.deleteApplicationFiles(applicationDir.toUri().toString()); + } + + } catch (YarnException | IOException e) { + throw new FlinkException("Could not kill the Yarn Flink cluster with id " + applicationId + '.', e); + } + } + + /** + * This method will block until the ApplicationMaster/JobManager have been deployed on YARN. + * + * @param clusterSpecification Initial cluster specification for the Flink cluster to be + * deployed + * @param applicationName name of the Yarn application to start + * @param yarnClusterEntrypoint Class name of the Yarn cluster entry point. 
+ * @param jobGraph A job graph which is deployed with the Flink cluster, {@code null} if none + * @param detached True if the cluster should be started in detached mode + */ + private ClusterClientProvider deployInternal( + ClusterSpecification clusterSpecification, + String applicationName, + String yarnClusterEntrypoint, + @Nullable JobGraph jobGraph, + boolean detached) + throws Exception { + + final UserGroupInformation currentUser = UserGroupInformation.getCurrentUser(); + if (HadoopUtils.isKerberosSecurityEnabled(currentUser)) { + boolean useTicketCache = flinkConfiguration.getBoolean(SecurityOptions.KERBEROS_LOGIN_USETICKETCACHE); + + if (!HadoopUtils.areKerberosCredentialsValid(currentUser, useTicketCache)) { + throw new RuntimeException("Hadoop security with Kerberos is enabled but the login user " + + "does not have Kerberos credentials or delegation tokens!"); + } + + final boolean fetchToken = flinkConfiguration.getBoolean(SecurityOptions.KERBEROS_FETCH_DELEGATION_TOKEN); + final boolean yarnAccessFSEnabled = !CollectionUtil.isNullOrEmpty( + flinkConfiguration.get(SecurityOptions.KERBEROS_HADOOP_FILESYSTEMS_TO_ACCESS)); + if (!fetchToken && yarnAccessFSEnabled) { + throw new IllegalConfigurationException(String.format( + "When %s is disabled, %s must be disabled as well.", + SecurityOptions.KERBEROS_FETCH_DELEGATION_TOKEN.key(), + SecurityOptions.KERBEROS_HADOOP_FILESYSTEMS_TO_ACCESS.key())); + } + } + + isReadyForDeployment(clusterSpecification); + + // ------------------ Check if the specified queue exists -------------------- + + checkYarnQueues(yarnClient); + + // ------------------ Check if the YARN ClusterClient has the requested resources + // -------------- + + // Create application via yarnClient + final YarnClientApplication yarnApplication = yarnClient.createApplication(); + final GetNewApplicationResponse appResponse = yarnApplication.getNewApplicationResponse(); + + Resource maxRes = appResponse.getMaximumResourceCapability(); + + final ClusterResourceDescription freeClusterMem; + try { + freeClusterMem = getCurrentFreeClusterResources(yarnClient); + } catch (YarnException | IOException e) { + failSessionDuringDeployment(yarnClient, yarnApplication); + throw new YarnDeploymentException("Could not retrieve information about free cluster resources.", e); + } + + final int yarnMinAllocationMB = yarnConfiguration.getInt( + YarnConfiguration.RM_SCHEDULER_MINIMUM_ALLOCATION_MB, + YarnConfiguration.DEFAULT_RM_SCHEDULER_MINIMUM_ALLOCATION_MB); + if (yarnMinAllocationMB <= 0) { + throw new YarnDeploymentException("The minimum allocation memory " + + "(" + + yarnMinAllocationMB + + " MB) configured via '" + + YarnConfiguration.RM_SCHEDULER_MINIMUM_ALLOCATION_MB + + "' should be greater than 0."); + } + + final ClusterSpecification validClusterSpecification; + try { + validClusterSpecification = + validateClusterResources(clusterSpecification, yarnMinAllocationMB, maxRes, freeClusterMem); + } catch (YarnDeploymentException yde) { + failSessionDuringDeployment(yarnClient, yarnApplication); + throw yde; + } + + LOG.info("Cluster specification: {}", validClusterSpecification); + + final ClusterEntrypoint.ExecutionMode executionMode = + detached ? 
ClusterEntrypoint.ExecutionMode.DETACHED : ClusterEntrypoint.ExecutionMode.NORMAL; + + flinkConfiguration.setString(ClusterEntrypoint.INTERNAL_CLUSTER_EXECUTION_MODE, executionMode.toString()); + + ApplicationReport report = startAppMaster( + flinkConfiguration, + applicationName, + yarnClusterEntrypoint, + jobGraph, + yarnClient, + yarnApplication, + validClusterSpecification); + + // print the application id for user to cancel themselves. + if (detached) { + final ApplicationId yarnApplicationId = report.getApplicationId(); + logDetachedClusterInformation(yarnApplicationId, LOG); + } + + setClusterEntrypointInfoToConfig(report); + + return () -> { + try { + return new RestClusterClient<>(flinkConfiguration, report.getApplicationId()); + } catch (Exception e) { + throw new RuntimeException("Error while creating RestClusterClient.", e); + } + }; + } + + private ClusterSpecification validateClusterResources( + ClusterSpecification clusterSpecification, + int yarnMinAllocationMB, + Resource maximumResourceCapability, + ClusterResourceDescription freeClusterResources) + throws YarnDeploymentException { + + int jobManagerMemoryMb = clusterSpecification.getMasterMemoryMB(); + final int taskManagerMemoryMb = clusterSpecification.getTaskManagerMemoryMB(); + + logIfComponentMemNotIntegerMultipleOfYarnMinAllocation("JobManager", jobManagerMemoryMb, yarnMinAllocationMB); + logIfComponentMemNotIntegerMultipleOfYarnMinAllocation("TaskManager", taskManagerMemoryMb, yarnMinAllocationMB); + + // set the memory to minAllocationMB to do the next checks correctly + if (jobManagerMemoryMb < yarnMinAllocationMB) { + jobManagerMemoryMb = yarnMinAllocationMB; + } + + final String note = + "Please check the 'yarn.scheduler.maximum-allocation-mb' and the 'yarn.nodemanager.resource.memory-mb' configuration values\n"; + if (jobManagerMemoryMb > maximumResourceCapability.getMemory()) { + throw new YarnDeploymentException( + "The cluster does not have the requested resources for the JobManager available!\n" + + "Maximum Memory: " + + maximumResourceCapability.getMemory() + + "MB Requested: " + + jobManagerMemoryMb + + "MB. " + + note); + } + + if (taskManagerMemoryMb > maximumResourceCapability.getMemory()) { + throw new YarnDeploymentException( + "The cluster does not have the requested resources for the TaskManagers available!\n" + + "Maximum Memory: " + + maximumResourceCapability.getMemory() + + " Requested: " + + taskManagerMemoryMb + + "MB. " + + note); + } + + final String noteRsc = + "\nThe Flink YARN client will try to allocate the YARN session, but maybe not all TaskManagers are " + + "connecting from the beginning because the resources are currently not available in the cluster. 
" + + "The allocation might take more time than usual because the Flink YARN client needs to wait until " + + "the resources become available."; + + if (taskManagerMemoryMb > freeClusterResources.containerLimit) { + LOG.warn("The requested amount of memory for the TaskManagers (" + + taskManagerMemoryMb + + "MB) is more than " + + "the largest possible YARN container: " + + freeClusterResources.containerLimit + + noteRsc); + } + if (jobManagerMemoryMb > freeClusterResources.containerLimit) { + LOG.warn("The requested amount of memory for the JobManager (" + + jobManagerMemoryMb + + "MB) is more than " + + "the largest possible YARN container: " + + freeClusterResources.containerLimit + + noteRsc); + } + + return new ClusterSpecification.ClusterSpecificationBuilder() + .setMasterMemoryMB(jobManagerMemoryMb) + .setTaskManagerMemoryMB(taskManagerMemoryMb) + .setSlotsPerTaskManager(clusterSpecification.getSlotsPerTaskManager()) + .createClusterSpecification(); + } + + private void logIfComponentMemNotIntegerMultipleOfYarnMinAllocation( + String componentName, int componentMemoryMB, int yarnMinAllocationMB) { + int normalizedMemMB = + (componentMemoryMB + (yarnMinAllocationMB - 1)) / yarnMinAllocationMB * yarnMinAllocationMB; + if (normalizedMemMB <= 0) { + normalizedMemMB = yarnMinAllocationMB; + } + if (componentMemoryMB != normalizedMemMB) { + LOG.info( + "The configured {} memory is {} MB. YARN will allocate {} MB to make up an integer multiple of its " + + "minimum allocation memory ({} MB, configured via 'yarn.scheduler.minimum-allocation-mb'). The extra {} MB " + + "may not be used by Flink.", + componentName, + componentMemoryMB, + normalizedMemMB, + yarnMinAllocationMB, + normalizedMemMB - componentMemoryMB); + } + } + + private void checkYarnQueues(YarnClient yarnClient) { + try { + List queues = yarnClient.getAllQueues(); + if (queues.size() > 0 + && this.yarnQueue != null) { // check only if there are queues configured in yarn and for + // this session. + boolean queueFound = false; + for (QueueInfo queue : queues) { + if (queue.getQueueName().equals(this.yarnQueue) + || queue.getQueueName().equals("root." + this.yarnQueue)) { + queueFound = true; + break; + } + } + if (!queueFound) { + String queueNames = StringUtils.toQuotedListString(queues.toArray()); + LOG.warn("The specified queue '" + + this.yarnQueue + + "' does not exist. " + + "Available queues: " + + queueNames); + } + } else { + LOG.debug("The YARN cluster does not have any queues configured"); + } + } catch (Throwable e) { + LOG.warn("Error while getting queue information from YARN: " + e.getMessage()); + if (LOG.isDebugEnabled()) { + LOG.debug("Error details", e); + } + } + } + + private ApplicationReport startAppMaster( + Configuration configuration, + String applicationName, + String yarnClusterEntrypoint, + JobGraph jobGraph, + YarnClient yarnClient, + YarnClientApplication yarnApplication, + ClusterSpecification clusterSpecification) + throws Exception { + + // ------------------ Initialize the file systems ------------------------- + + org.apache.flink.core.fs.FileSystem.initialize( + configuration, PluginUtils.createPluginManagerFromRootFolder(configuration)); + + final FileSystem fs = FileSystem.get(yarnConfiguration); + + // hard coded check for the GoogleHDFS client because its not overriding the getScheme() + // method. + if (!fs.getClass().getSimpleName().equals("GoogleHadoopFileSystem") + && fs.getScheme().startsWith("file")) { + LOG.warn("The file system scheme is '" + + fs.getScheme() + + "'. 
This indicates that the " + + "specified Hadoop configuration path is wrong and the system is using the default Hadoop configuration values." + + "The Flink YARN client needs to store its files in a distributed file system"); + } + + ApplicationSubmissionContext appContext = yarnApplication.getApplicationSubmissionContext(); + + final List providedLibDirs = Utils.getQualifiedRemoteProvidedLibDirs(configuration, yarnConfiguration); + + final Optional providedUsrLibDir = + Utils.getQualifiedRemoteProvidedUsrLib(configuration, yarnConfiguration); + + Path stagingDirPath = getStagingDir(fs); + FileSystem stagingDirFs = stagingDirPath.getFileSystem(yarnConfiguration); + final YarnApplicationFileUploader fileUploader = YarnApplicationFileUploader.from( + stagingDirFs, stagingDirPath, providedLibDirs, appContext.getApplicationId(), getFileReplication()); + + // The files need to be shipped and added to classpath. + Set systemShipFiles = new HashSet<>(shipFiles.size()); + for (File file : shipFiles) { + systemShipFiles.add(file.getAbsoluteFile()); + } + + final String logConfigFilePath = configuration.getString(YarnConfigOptionsInternal.APPLICATION_LOG_CONFIG_FILE); + if (logConfigFilePath != null) { + systemShipFiles.add(new File(logConfigFilePath)); + } + + // Set-up ApplicationSubmissionContext for the application + + final ApplicationId appId = appContext.getApplicationId(); + + // ------------------ Add Zookeeper namespace to local flinkConfiguraton ------ + setHAClusterIdIfNotSet(configuration, appId); + + if (HighAvailabilityMode.isHighAvailabilityModeActivated(configuration)) { + // activate re-execution of failed applications + appContext.setMaxAppAttempts(configuration.getInteger( + YarnConfigOptions.APPLICATION_ATTEMPTS.key(), YarnConfiguration.DEFAULT_RM_AM_MAX_ATTEMPTS)); + + activateHighAvailabilitySupport(appContext); + } else { + // set number of application retries to 1 in the default case + appContext.setMaxAppAttempts(configuration.getInteger(YarnConfigOptions.APPLICATION_ATTEMPTS.key(), 1)); + } + + final Set userJarFiles = new HashSet<>(); + if (jobGraph != null) { + userJarFiles.addAll(jobGraph.getUserJars().stream() + .map(f -> f.toUri()) + .map(Path::new) + .collect(Collectors.toSet())); + } + + final List jarUrls = ConfigUtils.decodeListFromConfig(configuration, PipelineOptions.JARS, URI::create); + if (jarUrls != null && YarnApplicationClusterEntryPoint.class.getName().equals(yarnClusterEntrypoint)) { + userJarFiles.addAll(jarUrls.stream().map(Path::new).collect(Collectors.toSet())); + } + + // only for per job mode + if (jobGraph != null) { + for (Map.Entry entry : + jobGraph.getUserArtifacts().entrySet()) { + // only upload local files + if (!Utils.isRemotePath(entry.getValue().filePath)) { + Path localPath = new Path(entry.getValue().filePath); + Tuple2 remoteFileInfo = fileUploader.uploadLocalFileToRemote(localPath, entry.getKey()); + jobGraph.setUserArtifactRemotePath(entry.getKey(), remoteFileInfo.f0.toString()); + } + } + + jobGraph.writeUserArtifactEntriesToConfiguration(); + } + + if (providedLibDirs == null || providedLibDirs.isEmpty()) { + addLibFoldersToShipFiles(systemShipFiles); + } + + // Register all files in provided lib dirs as local resources with public visibility + // and upload the remaining dependencies as local resources with APPLICATION visibility. 
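+ // Files under the provided lib dirs already live on the remote file system, so only the locally collected ship files are uploaded to the staging directory here.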
+ final List systemClassPaths = fileUploader.registerProvidedLocalResources(); + final List uploadedDependencies = fileUploader.registerMultipleLocalResources( + systemShipFiles.stream().map(e -> new Path(e.toURI())).collect(Collectors.toSet()), + Path.CUR_DIR, + LocalResourceType.FILE); + systemClassPaths.addAll(uploadedDependencies); + + // upload and register ship-only files + // Plugin files only need to be shipped and should not be added to classpath. + if (providedLibDirs == null || providedLibDirs.isEmpty()) { + Set shipOnlyFiles = new HashSet<>(); + addPluginsFoldersToShipFiles(shipOnlyFiles); + fileUploader.registerMultipleLocalResources( + shipOnlyFiles.stream().map(e -> new Path(e.toURI())).collect(Collectors.toSet()), + Path.CUR_DIR, + LocalResourceType.FILE); + } + + if (!shipArchives.isEmpty()) { + fileUploader.registerMultipleLocalResources( + shipArchives.stream().map(e -> new Path(e.toURI())).collect(Collectors.toSet()), + Path.CUR_DIR, + LocalResourceType.ARCHIVE); + } + + // only for application mode + // Python jar file only needs to be shipped and should not be added to classpath. + if (YarnApplicationClusterEntryPoint.class.getName().equals(yarnClusterEntrypoint) + && PackagedProgramUtils.isPython(configuration.get(APPLICATION_MAIN_CLASS))) { + fileUploader.registerMultipleLocalResources( + Collections.singletonList( + new Path(PackagedProgramUtils.getPythonJar().toURI())), + ConfigConstants.DEFAULT_FLINK_OPT_DIR, + LocalResourceType.FILE); + } + + // Upload and register user jars + final List userClassPaths = fileUploader.registerMultipleLocalResources( + userJarFiles, + userJarInclusion == YarnConfigOptions.UserJarInclusion.DISABLED + ? ConfigConstants.DEFAULT_FLINK_USR_LIB_DIR + : Path.CUR_DIR, + LocalResourceType.FILE); + + // usrlib in remote will be used first. + if (providedUsrLibDir.isPresent()) { + final List usrLibClassPaths = fileUploader.registerMultipleLocalResources( + Collections.singletonList(providedUsrLibDir.get()), Path.CUR_DIR, LocalResourceType.FILE); + userClassPaths.addAll(usrLibClassPaths); + } else if (ClusterEntrypointUtils.tryFindUserLibDirectory().isPresent()) { + // local usrlib will be automatically shipped if it exists and there is no remote + // usrlib. 
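+ // ClusterEntrypointUtils.tryFindUserLibDirectory() looks for a local 'usrlib' directory next to the Flink distribution; its contents are shipped and added to the user classpath below.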
+ final Set usrLibShipFiles = new HashSet<>(); + addUsrLibFolderToShipFiles(usrLibShipFiles); + final List usrLibClassPaths = fileUploader.registerMultipleLocalResources( + usrLibShipFiles.stream().map(e -> new Path(e.toURI())).collect(Collectors.toSet()), + Path.CUR_DIR, + LocalResourceType.FILE); + userClassPaths.addAll(usrLibClassPaths); + } + + if (userJarInclusion == YarnConfigOptions.UserJarInclusion.ORDER) { + systemClassPaths.addAll(userClassPaths); + } + + // normalize classpath by sorting + Collections.sort(systemClassPaths); + Collections.sort(userClassPaths); + + // classpath assembler + StringBuilder classPathBuilder = new StringBuilder(); + if (userJarInclusion == YarnConfigOptions.UserJarInclusion.FIRST) { + for (String userClassPath : userClassPaths) { + classPathBuilder.append(userClassPath).append(pathSeparator); + } + } + for (String classPath : systemClassPaths) { + classPathBuilder.append(classPath).append(pathSeparator); + } + + // Setup jar for ApplicationMaster + final YarnLocalResourceDescriptor localResourceDescFlinkJar = fileUploader.uploadFlinkDist(flinkJarPath); + classPathBuilder.append(localResourceDescFlinkJar.getResourceKey()).append(pathSeparator); + + // write job graph to tmp file and add it to local resource + // TODO: server use user main method to generate job graph + if (jobGraph != null) { + File tmpJobGraphFile = null; + try { + tmpJobGraphFile = File.createTempFile(appId.toString(), null); + try (FileOutputStream output = new FileOutputStream(tmpJobGraphFile); + ObjectOutputStream obOutput = new ObjectOutputStream(output)) { + obOutput.writeObject(jobGraph); + } + + final String jobGraphFilename = "job.graph"; + configuration.setString(JOB_GRAPH_FILE_PATH, jobGraphFilename); + + fileUploader.registerSingleLocalResource( + jobGraphFilename, new Path(tmpJobGraphFile.toURI()), "", LocalResourceType.FILE, true, false); + classPathBuilder.append(jobGraphFilename).append(pathSeparator); + } catch (Exception e) { + LOG.warn("Add job graph to local resource fail."); + throw e; + } finally { + if (tmpJobGraphFile != null && !tmpJobGraphFile.delete()) { + LOG.warn("Fail to delete temporary file {}.", tmpJobGraphFile.toPath()); + } + } + } + + // Upload the flink configuration + // write out configuration file + File tmpConfigurationFile = null; + try { + tmpConfigurationFile = File.createTempFile(appId + "-flink-conf.yaml", null); + + // remove localhost bind hosts as they render production clusters unusable + removeLocalhostBindHostSetting(configuration, JobManagerOptions.BIND_HOST); + removeLocalhostBindHostSetting(configuration, TaskManagerOptions.BIND_HOST); + // this setting is unconditionally overridden anyway, so we remove it for clarity + configuration.removeConfig(TaskManagerOptions.HOST); + + BootstrapTools.writeConfiguration(configuration, tmpConfigurationFile); + + String flinkConfigKey = "flink-conf.yaml"; + fileUploader.registerSingleLocalResource( + flinkConfigKey, + new Path(tmpConfigurationFile.getAbsolutePath()), + "", + LocalResourceType.FILE, + true, + true); + classPathBuilder.append("flink-conf.yaml").append(pathSeparator); + } finally { + if (tmpConfigurationFile != null && !tmpConfigurationFile.delete()) { + LOG.warn("Fail to delete temporary file {}.", tmpConfigurationFile.toPath()); + } + } + + if (userJarInclusion == YarnConfigOptions.UserJarInclusion.LAST) { + for (String userClassPath : userClassPaths) { + classPathBuilder.append(userClassPath).append(pathSeparator); + } + } + + // To support Yarn Secure Integration Test 
Scenario + // In Integration test setup, the Yarn containers created by YarnMiniCluster does not have + // the Yarn site XML + // and KRB5 configuration files. We are adding these files as container local resources for + // the container + // applications (JM/TMs) to have proper secure cluster setup + Path remoteYarnSiteXmlPath = null; + if (System.getenv("IN_TESTS") != null) { + File f = new File(System.getenv("YARN_CONF_DIR"), Utils.YARN_SITE_FILE_NAME); + LOG.info("Adding Yarn configuration {} to the AM container local resource bucket", f.getAbsolutePath()); + Path yarnSitePath = new Path(f.getAbsolutePath()); + remoteYarnSiteXmlPath = fileUploader + .registerSingleLocalResource( + Utils.YARN_SITE_FILE_NAME, yarnSitePath, "", LocalResourceType.FILE, false, false) + .getPath(); + if (System.getProperty("java.security.krb5.conf") != null) { + configuration.set(SecurityOptions.KERBEROS_KRB5_PATH, System.getProperty("java.security.krb5.conf")); + } + } + + Path remoteKrb5Path = null; + boolean hasKrb5 = false; + String krb5Config = configuration.get(SecurityOptions.KERBEROS_KRB5_PATH); + if (!StringUtils.isNullOrWhitespaceOnly(krb5Config)) { + final File krb5 = new File(krb5Config); + LOG.info("Adding KRB5 configuration {} to the AM container local resource bucket", krb5.getAbsolutePath()); + final Path krb5ConfPath = new Path(krb5.getAbsolutePath()); + remoteKrb5Path = fileUploader + .registerSingleLocalResource( + Utils.KRB5_FILE_NAME, krb5ConfPath, "", LocalResourceType.FILE, false, false) + .getPath(); + hasKrb5 = true; + } + + Path remotePathKeytab = null; + String localizedKeytabPath = null; + String keytab = configuration.getString(SecurityOptions.KERBEROS_LOGIN_KEYTAB); + if (keytab != null) { + boolean localizeKeytab = flinkConfiguration.getBoolean(YarnConfigOptions.SHIP_LOCAL_KEYTAB); + localizedKeytabPath = flinkConfiguration.getString(YarnConfigOptions.LOCALIZED_KEYTAB_PATH); + if (localizeKeytab) { + // Localize the keytab to YARN containers via local resource. + LOG.info("Adding keytab {} to the AM container local resource bucket", keytab); + remotePathKeytab = fileUploader + .registerSingleLocalResource( + localizedKeytabPath, new Path(keytab), "", LocalResourceType.FILE, false, false) + .getPath(); + } else { + // // Assume Keytab is pre-installed in the container. 
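+ // In this branch only the configured keytab path is handed to the containers via the environment; the keytab file itself is not uploaded.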
+ localizedKeytabPath = flinkConfiguration.getString(YarnConfigOptions.LOCALIZED_KEYTAB_PATH); + } + } + + final JobManagerProcessSpec processSpec = + JobManagerProcessUtils.processSpecFromConfigWithNewOptionToInterpretLegacyHeap( + flinkConfiguration, JobManagerOptions.TOTAL_PROCESS_MEMORY); + final ContainerLaunchContext amContainer = + setupApplicationMasterContainer(yarnClusterEntrypoint, hasKrb5, processSpec); + + boolean fetchToken = configuration.getBoolean(SecurityOptions.DELEGATION_TOKENS_ENABLED); + KerberosLoginProvider kerberosLoginProvider = new KerberosLoginProvider(configuration); + if (kerberosLoginProvider.isLoginPossible(true)) { + setTokensFor(amContainer, fetchToken); + } else { + LOG.info("Cannot use kerberos delegation token manager, no valid kerberos credentials provided."); + } + + amContainer.setLocalResources(fileUploader.getRegisteredLocalResources()); + fileUploader.close(); + + // Setup CLASSPATH and environment variables for ApplicationMaster + final Map appMasterEnv = generateApplicationMasterEnv( + fileUploader, classPathBuilder.toString(), localResourceDescFlinkJar.toString(), appId.toString()); + + if (localizedKeytabPath != null) { + appMasterEnv.put(YarnConfigKeys.LOCAL_KEYTAB_PATH, localizedKeytabPath); + String principal = configuration.getString(SecurityOptions.KERBEROS_LOGIN_PRINCIPAL); + appMasterEnv.put(YarnConfigKeys.KEYTAB_PRINCIPAL, principal); + if (remotePathKeytab != null) { + appMasterEnv.put(YarnConfigKeys.REMOTE_KEYTAB_PATH, remotePathKeytab.toString()); + } + } + + // To support Yarn Secure Integration Test Scenario + if (remoteYarnSiteXmlPath != null) { + appMasterEnv.put(YarnConfigKeys.ENV_YARN_SITE_XML_PATH, remoteYarnSiteXmlPath.toString()); + } + if (remoteKrb5Path != null) { + appMasterEnv.put(YarnConfigKeys.ENV_KRB5_PATH, remoteKrb5Path.toString()); + } + + amContainer.setEnvironment(appMasterEnv); + + // Set up resource type requirements for ApplicationMaster + Resource capability = Records.newRecord(Resource.class); + capability.setMemory(clusterSpecification.getMasterMemoryMB()); + capability.setVirtualCores(flinkConfiguration.getInteger(YarnConfigOptions.APP_MASTER_VCORES)); + + final String customApplicationName = customName != null ? customName : applicationName; + + appContext.setApplicationName(customApplicationName); + appContext.setApplicationType(applicationType != null ? 
applicationType : "Apache Flink"); + appContext.setAMContainerSpec(amContainer); + appContext.setResource(capability); + + // Set priority for application + int priorityNum = flinkConfiguration.getInteger(YarnConfigOptions.APPLICATION_PRIORITY); + if (priorityNum >= 0) { + Priority priority = Priority.newInstance(priorityNum); + appContext.setPriority(priority); + } + + if (yarnQueue != null) { + appContext.setQueue(yarnQueue); + } + + setApplicationNodeLabel(appContext); + + setApplicationTags(appContext); + + // add a hook to clean up in case deployment fails + Thread deploymentFailureHook = new DeploymentFailureHook(yarnApplication, fileUploader.getApplicationDir()); + Runtime.getRuntime().addShutdownHook(deploymentFailureHook); + LOG.info("Submitting application master " + appId); + yarnClient.submitApplication(appContext); + + LOG.info("Waiting for the cluster to be allocated"); + final long startTime = System.currentTimeMillis(); + long lastLogTime = System.currentTimeMillis(); + ApplicationReport report; + YarnApplicationState lastAppState = YarnApplicationState.NEW; + loop: + while (true) { + try { + report = yarnClient.getApplicationReport(appId); + } catch (IOException e) { + throw new YarnDeploymentException("Failed to deploy the cluster.", e); + } + YarnApplicationState appState = report.getYarnApplicationState(); + LOG.debug("Application State: {}", appState); + switch (appState) { + case FAILED: + case KILLED: + throw new YarnDeploymentException("The YARN application unexpectedly switched to state " + + appState + + " during deployment. \n" + + "Diagnostics from YARN: " + + report.getDiagnostics() + + "\n" + + "If log aggregation is enabled on your cluster, use this command to further investigate the issue:\n" + + "yarn logs -applicationId " + + appId); + // break .. + case RUNNING: + LOG.info("YARN application has been deployed successfully."); + break loop; + case FINISHED: + LOG.info("YARN application has been finished successfully."); + break loop; + default: + if (appState != lastAppState) { + LOG.info("Deploying cluster, current state " + appState); + } + if (System.currentTimeMillis() - lastLogTime > 60000) { + lastLogTime = System.currentTimeMillis(); + LOG.info( + "Deployment took more than {} seconds. 
Please check if the requested resources are available in the YARN cluster", + (lastLogTime - startTime) / 1000); + } + } + lastAppState = appState; + Thread.sleep(250); + } + + // since deployment was successful, remove the hook + ShutdownHookUtil.removeShutdownHook(deploymentFailureHook, getClass().getSimpleName(), LOG); + return report; + } + + private void removeLocalhostBindHostSetting(Configuration configuration, ConfigOption option) { + configuration + .getOptional(option) + .filter(bindHost -> bindHost.equals("localhost")) + .ifPresent(bindHost -> { + LOG.info( + "Removing 'localhost' {} setting from effective configuration; using '0.0.0.0' instead.", + option); + configuration.removeConfig(option); + }); + } + + private void setTokensFor(ContainerLaunchContext containerLaunchContext, boolean fetchToken) throws Exception { + Credentials credentials = new Credentials(); + + LOG.info("Loading delegation tokens available locally to add to the AM container"); + // for user + UserGroupInformation currUsr = UserGroupInformation.getCurrentUser(); + + Collection> usrTok = + currUsr.getCredentials().getAllTokens(); + for (Token token : usrTok) { + LOG.info("Adding user token " + token.getService() + " with " + token); + credentials.addToken(token.getService(), token); + } + + if (fetchToken) { + LOG.info("Fetching delegation tokens to add to the AM container."); + DelegationTokenManager delegationTokenManager = + new DefaultDelegationTokenManager(flinkConfiguration, null, null, null); + DelegationTokenContainer container = new DelegationTokenContainer(); + delegationTokenManager.obtainDelegationTokens(container); + + // This is here for backward compatibility to make log aggregation work + for (Map.Entry e : container.getTokens().entrySet()) { + if (e.getKey().equals("hadoopfs")) { + credentials.addAll(HadoopDelegationTokenConverter.deserialize(e.getValue())); + } + } + } + + ByteBuffer tokens = ByteBuffer.wrap(HadoopDelegationTokenConverter.serialize(credentials)); + containerLaunchContext.setTokens(tokens); + + LOG.info("Delegation tokens added to the AM container."); + } + + /** + * Returns the configured remote target home directory if set, otherwise returns the default + * home directory. + * + * @param defaultFileSystem default file system used + * @return the remote target home directory + */ + @VisibleForTesting + Path getStagingDir(FileSystem defaultFileSystem) throws IOException { + final String configuredStagingDir = flinkConfiguration.getString(YarnConfigOptions.STAGING_DIRECTORY); + if (configuredStagingDir == null) { + return defaultFileSystem.getHomeDirectory(); + } + FileSystem stagingDirFs = new Path(configuredStagingDir).getFileSystem(defaultFileSystem.getConf()); + return stagingDirFs.makeQualified(new Path(configuredStagingDir)); + } + + private int getFileReplication() { + final int yarnFileReplication = + yarnConfiguration.getInt(DFSConfigKeys.DFS_REPLICATION_KEY, DFSConfigKeys.DFS_REPLICATION_DEFAULT); + final int fileReplication = flinkConfiguration.getInteger(YarnConfigOptions.FILE_REPLICATION); + return fileReplication > 0 ? fileReplication : yarnFileReplication; + } + + private static String encodeYarnLocalResourceDescriptorListToString(List resources) { + return String.join( + LOCAL_RESOURCE_DESCRIPTOR_SEPARATOR, + resources.stream().map(YarnLocalResourceDescriptor::toString).collect(Collectors.toList())); + } + + /** + * Kills YARN application and stops YARN client. + * + *
<p>
Use this method to kill the App before it has been properly deployed + */ + private void failSessionDuringDeployment(YarnClient yarnClient, YarnClientApplication yarnApplication) { + LOG.info("Killing YARN application"); + + try { + yarnClient.killApplication( + yarnApplication.getNewApplicationResponse().getApplicationId()); + } catch (Exception e) { + // we only log a debug message here because the "killApplication" call is a best-effort + // call (we don't know if the application has been deployed when the error occurred). + LOG.debug("Error while killing YARN application", e); + } + } + + private static class ClusterResourceDescription { + public final int totalFreeMemory; + public final int containerLimit; + public final int[] nodeManagersFree; + + public ClusterResourceDescription(int totalFreeMemory, int containerLimit, int[] nodeManagersFree) { + this.totalFreeMemory = totalFreeMemory; + this.containerLimit = containerLimit; + this.nodeManagersFree = nodeManagersFree; + } + } + + private ClusterResourceDescription getCurrentFreeClusterResources(YarnClient yarnClient) + throws YarnException, IOException { + List nodes = yarnClient.getNodeReports(NodeState.RUNNING); + + int totalFreeMemory = 0; + int containerLimit = 0; + int[] nodeManagersFree = new int[nodes.size()]; + + for (int i = 0; i < nodes.size(); i++) { + NodeReport rep = nodes.get(i); + int free = rep.getCapability().getMemory() + - (rep.getUsed() != null ? rep.getUsed().getMemory() : 0); + nodeManagersFree[i] = free; + totalFreeMemory += free; + if (free > containerLimit) { + containerLimit = free; + } + } + return new ClusterResourceDescription(totalFreeMemory, containerLimit, nodeManagersFree); + } + + @Override + public String getClusterDescription() { + + try { + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + PrintStream ps = new PrintStream(baos); + + YarnClusterMetrics metrics = yarnClient.getYarnClusterMetrics(); + + ps.append("NodeManagers in the ClusterClient " + metrics.getNumNodeManagers()); + List nodes = yarnClient.getNodeReports(NodeState.RUNNING); + final String format = "|%-16s |%-16s %n"; + ps.printf("|Property |Value %n"); + ps.println("+---------------------------------------+"); + int totalMemory = 0; + int totalCores = 0; + for (NodeReport rep : nodes) { + final Resource res = rep.getCapability(); + totalMemory += res.getMemory(); + totalCores += res.getVirtualCores(); + ps.format(format, "NodeID", rep.getNodeId()); + ps.format(format, "Memory", res.getMemory() + " MB"); + ps.format(format, "vCores", res.getVirtualCores()); + ps.format(format, "HealthReport", rep.getHealthReport()); + ps.format(format, "Containers", rep.getNumContainers()); + ps.println("+---------------------------------------+"); + } + ps.println("Summary: totalMemory " + totalMemory + " totalCores " + totalCores); + List qInfo = yarnClient.getAllQueues(); + for (QueueInfo q : qInfo) { + ps.println("Queue: " + + q.getQueueName() + + ", Current Capacity: " + + q.getCurrentCapacity() + + " Max Capacity: " + + q.getMaximumCapacity() + + " Applications: " + + q.getApplications().size()); + } + return baos.toString(); + } catch (Exception e) { + throw new RuntimeException("Couldn't get cluster description", e); + } + } + + private void activateHighAvailabilitySupport(ApplicationSubmissionContext appContext) + throws InvocationTargetException, IllegalAccessException { + + ApplicationSubmissionContextReflector reflector = ApplicationSubmissionContextReflector.getInstance(); + + 
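+ // Keeping containers across application attempts lets running TaskManager containers survive an ApplicationMaster restart during failover.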
reflector.setKeepContainersAcrossApplicationAttempts(appContext, true); + + reflector.setAttemptFailuresValidityInterval( + appContext, + flinkConfiguration.getLong(YarnConfigOptions.APPLICATION_ATTEMPT_FAILURE_VALIDITY_INTERVAL)); + } + + private void setApplicationTags(final ApplicationSubmissionContext appContext) + throws InvocationTargetException, IllegalAccessException { + + final ApplicationSubmissionContextReflector reflector = ApplicationSubmissionContextReflector.getInstance(); + final String tagsString = flinkConfiguration.getString(YarnConfigOptions.APPLICATION_TAGS); + + final Set applicationTags = new HashSet<>(); + + // Trim whitespace and cull empty tags + for (final String tag : tagsString.split(",")) { + final String trimmedTag = tag.trim(); + if (!trimmedTag.isEmpty()) { + applicationTags.add(trimmedTag); + } + } + + reflector.setApplicationTags(appContext, applicationTags); + } + + private void setApplicationNodeLabel(final ApplicationSubmissionContext appContext) + throws InvocationTargetException, IllegalAccessException { + + if (nodeLabel != null) { + final ApplicationSubmissionContextReflector reflector = ApplicationSubmissionContextReflector.getInstance(); + reflector.setApplicationNodeLabel(appContext, nodeLabel); + } + } + + /** + * Singleton object which uses reflection to determine whether the {@link + * ApplicationSubmissionContext} supports various methods which, depending on the Hadoop + * version, may or may not be supported. + * + *
<p>
If an unsupported method is invoked, nothing happens. + * + *
<p>
Currently three methods are proxied: - setApplicationTags (>= 2.4.0) - + * setAttemptFailuresValidityInterval (>= 2.6.0) - setKeepContainersAcrossApplicationAttempts + * (>= 2.4.0) - setNodeLabelExpression (>= 2.6.0) + */ + private static class ApplicationSubmissionContextReflector { + private static final Logger LOG = LoggerFactory.getLogger(ApplicationSubmissionContextReflector.class); + + private static final ApplicationSubmissionContextReflector instance = + new ApplicationSubmissionContextReflector(ApplicationSubmissionContext.class); + + public static ApplicationSubmissionContextReflector getInstance() { + return instance; + } + + private static final String APPLICATION_TAGS_METHOD_NAME = "setApplicationTags"; + private static final String ATTEMPT_FAILURES_METHOD_NAME = "setAttemptFailuresValidityInterval"; + private static final String KEEP_CONTAINERS_METHOD_NAME = "setKeepContainersAcrossApplicationAttempts"; + private static final String NODE_LABEL_EXPRESSION_NAME = "setNodeLabelExpression"; + + private final Method applicationTagsMethod; + private final Method attemptFailuresValidityIntervalMethod; + private final Method keepContainersMethod; + + @Nullable + private final Method nodeLabelExpressionMethod; + + private ApplicationSubmissionContextReflector(Class clazz) { + Method applicationTagsMethod; + Method attemptFailuresValidityIntervalMethod; + Method keepContainersMethod; + Method nodeLabelExpressionMethod; + + try { + // this method is only supported by Hadoop 2.4.0 onwards + applicationTagsMethod = clazz.getMethod(APPLICATION_TAGS_METHOD_NAME, Set.class); + LOG.debug("{} supports method {}.", clazz.getCanonicalName(), APPLICATION_TAGS_METHOD_NAME); + } catch (NoSuchMethodException e) { + LOG.debug("{} does not support method {}.", clazz.getCanonicalName(), APPLICATION_TAGS_METHOD_NAME); + // assign null because the Hadoop version apparently does not support this call. + applicationTagsMethod = null; + } + + this.applicationTagsMethod = applicationTagsMethod; + + try { + // this method is only supported by Hadoop 2.6.0 onwards + attemptFailuresValidityIntervalMethod = clazz.getMethod(ATTEMPT_FAILURES_METHOD_NAME, long.class); + LOG.debug("{} supports method {}.", clazz.getCanonicalName(), ATTEMPT_FAILURES_METHOD_NAME); + } catch (NoSuchMethodException e) { + LOG.debug("{} does not support method {}.", clazz.getCanonicalName(), ATTEMPT_FAILURES_METHOD_NAME); + // assign null because the Hadoop version apparently does not support this call. + attemptFailuresValidityIntervalMethod = null; + } + + this.attemptFailuresValidityIntervalMethod = attemptFailuresValidityIntervalMethod; + + try { + // this method is only supported by Hadoop 2.4.0 onwards + keepContainersMethod = clazz.getMethod(KEEP_CONTAINERS_METHOD_NAME, boolean.class); + LOG.debug("{} supports method {}.", clazz.getCanonicalName(), KEEP_CONTAINERS_METHOD_NAME); + } catch (NoSuchMethodException e) { + LOG.debug("{} does not support method {}.", clazz.getCanonicalName(), KEEP_CONTAINERS_METHOD_NAME); + // assign null because the Hadoop version apparently does not support this call. 
+ keepContainersMethod = null; + } + + this.keepContainersMethod = keepContainersMethod; + + try { + nodeLabelExpressionMethod = clazz.getMethod(NODE_LABEL_EXPRESSION_NAME, String.class); + LOG.debug("{} supports method {}.", clazz.getCanonicalName(), NODE_LABEL_EXPRESSION_NAME); + } catch (NoSuchMethodException e) { + LOG.debug("{} does not support method {}.", clazz.getCanonicalName(), NODE_LABEL_EXPRESSION_NAME); + nodeLabelExpressionMethod = null; + } + + this.nodeLabelExpressionMethod = nodeLabelExpressionMethod; + } + + public void setApplicationTags(ApplicationSubmissionContext appContext, Set applicationTags) + throws InvocationTargetException, IllegalAccessException { + if (applicationTagsMethod != null) { + LOG.debug( + "Calling method {} of {}.", + applicationTagsMethod.getName(), + appContext.getClass().getCanonicalName()); + applicationTagsMethod.invoke(appContext, applicationTags); + } else { + LOG.debug( + "{} does not support method {}. Doing nothing.", + appContext.getClass().getCanonicalName(), + APPLICATION_TAGS_METHOD_NAME); + } + } + + public void setApplicationNodeLabel(ApplicationSubmissionContext appContext, String nodeLabel) + throws InvocationTargetException, IllegalAccessException { + if (nodeLabelExpressionMethod != null) { + LOG.debug( + "Calling method {} of {}.", + nodeLabelExpressionMethod.getName(), + appContext.getClass().getCanonicalName()); + nodeLabelExpressionMethod.invoke(appContext, nodeLabel); + } else { + LOG.debug( + "{} does not support method {}. Doing nothing.", + appContext.getClass().getCanonicalName(), + NODE_LABEL_EXPRESSION_NAME); + } + } + + public void setAttemptFailuresValidityInterval(ApplicationSubmissionContext appContext, long validityInterval) + throws InvocationTargetException, IllegalAccessException { + if (attemptFailuresValidityIntervalMethod != null) { + LOG.debug( + "Calling method {} of {}.", + attemptFailuresValidityIntervalMethod.getName(), + appContext.getClass().getCanonicalName()); + attemptFailuresValidityIntervalMethod.invoke(appContext, validityInterval); + } else { + LOG.debug( + "{} does not support method {}. Doing nothing.", + appContext.getClass().getCanonicalName(), + ATTEMPT_FAILURES_METHOD_NAME); + } + } + + public void setKeepContainersAcrossApplicationAttempts( + ApplicationSubmissionContext appContext, boolean keepContainers) + throws InvocationTargetException, IllegalAccessException { + + if (keepContainersMethod != null) { + LOG.debug( + "Calling method {} of {}.", + keepContainersMethod.getName(), + appContext.getClass().getCanonicalName()); + keepContainersMethod.invoke(appContext, keepContainers); + } else { + LOG.debug( + "{} does not support method {}. 
Doing nothing.", + appContext.getClass().getCanonicalName(), + KEEP_CONTAINERS_METHOD_NAME); + } + } + } + + private static class YarnDeploymentException extends RuntimeException { + private static final long serialVersionUID = -812040641215388943L; + + public YarnDeploymentException(String message) { + super(message); + } + + public YarnDeploymentException(String message, Throwable cause) { + super(message, cause); + } + } + + private class DeploymentFailureHook extends Thread { + + private final YarnClient yarnClient; + private final YarnClientApplication yarnApplication; + private final Path yarnFilesDir; + + DeploymentFailureHook(YarnClientApplication yarnApplication, Path yarnFilesDir) { + this.yarnApplication = Preconditions.checkNotNull(yarnApplication); + this.yarnFilesDir = Preconditions.checkNotNull(yarnFilesDir); + + // A new yarn client need to be created in shutdown hook in order to avoid + // the yarn client has been closed by YarnClusterDescriptor. + this.yarnClient = YarnClient.createYarnClient(); + this.yarnClient.init(yarnConfiguration); + } + + @Override + public void run() { + LOG.info("Cancelling deployment from Deployment Failure Hook"); + yarnClient.start(); + failSessionDuringDeployment(yarnClient, yarnApplication); + yarnClient.stop(); + LOG.info("Deleting files in {}.", yarnFilesDir); + try { + FileSystem fs = FileSystem.get(yarnConfiguration); + + if (!fs.delete(yarnFilesDir, true)) { + throw new IOException("Deleting files in " + yarnFilesDir + " was unsuccessful"); + } + + fs.close(); + } catch (IOException e) { + LOG.error("Failed to delete Flink Jar and configuration files in HDFS", e); + } + } + } + + @VisibleForTesting + void addLibFoldersToShipFiles(Collection effectiveShipFiles) { + // Add lib folder to the ship files if the environment variable is set. + // This is for convenience when running from the command-line. + // (for other files users explicitly set the ship files) + String libDir = System.getenv().get(ENV_FLINK_LIB_DIR); + if (libDir != null) { + File directoryFile = new File(libDir); + if (directoryFile.isDirectory()) { + effectiveShipFiles.add(directoryFile); + } else { + throw new YarnDeploymentException("The environment variable '" + + ENV_FLINK_LIB_DIR + + "' is set to '" + + libDir + + "' but the directory doesn't exist."); + } + } else if (shipFiles.isEmpty()) { + LOG.warn( + "Environment variable '{}' not set and ship files have not been provided manually. " + + "Not shipping any library files.", + ENV_FLINK_LIB_DIR); + } + } + + @VisibleForTesting + void addUsrLibFolderToShipFiles(Collection effectiveShipFiles) { + // Add usrlib folder to the ship files if it exists + // Classes in the folder will be loaded by UserClassLoader if CLASSPATH_INCLUDE_USER_JAR is + // DISABLED. 
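The ApplicationSubmissionContextReflector above boils down to a probe-then-invoke pattern: each optional setter is resolved once with getMethod, a NoSuchMethodException is remembered as null, and later calls silently become no-ops on Hadoop versions that lack the method. A minimal, self-contained sketch of that pattern, assuming only java.lang.reflect (the Capability class name is illustrative, not part of this patch):

import java.lang.reflect.Method;

/** Illustrative probe-then-invoke helper: resolve an optional method once, no-op when absent. */
final class Capability {
    private final Method method; // null when the target class does not expose the method

    Capability(Class<?> targetClass, String methodName, Class<?>... parameterTypes) {
        Method resolved;
        try {
            resolved = targetClass.getMethod(methodName, parameterTypes);
        } catch (NoSuchMethodException e) {
            resolved = null; // older API: remember the absence instead of failing later
        }
        this.method = resolved;
    }

    /** Invokes the probed method when available; silently does nothing otherwise. */
    void invokeIfSupported(Object target, Object... args) throws Exception {
        if (method != null) {
            method.invoke(target, args);
        }
    }
}

For example, new Capability(ApplicationSubmissionContext.class, "setNodeLabelExpression", String.class) behaves like the nodeLabelExpressionMethod handling above.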
+ ClusterEntrypointUtils.tryFindUserLibDirectory().ifPresent(usrLibDirFile -> { + effectiveShipFiles.add(usrLibDirFile); + LOG.info("usrlib: {} will be shipped automatically.", usrLibDirFile.getAbsolutePath()); + }); + } + + @VisibleForTesting + void addPluginsFoldersToShipFiles(Collection effectiveShipFiles) { + final Optional pluginsDir = PluginConfig.getPluginsDir(); + pluginsDir.ifPresent(effectiveShipFiles::add); + } + + ContainerLaunchContext setupApplicationMasterContainer( + String yarnClusterEntrypoint, boolean hasKrb5, JobManagerProcessSpec processSpec) { + // ------------------ Prepare Application Master Container ------------------------------ + + // respect custom JVM options in the YAML file + String javaOpts = flinkConfiguration.getString(CoreOptions.FLINK_JVM_OPTIONS); + if (flinkConfiguration.getString(CoreOptions.FLINK_JM_JVM_OPTIONS).length() > 0) { + javaOpts += " " + flinkConfiguration.getString(CoreOptions.FLINK_JM_JVM_OPTIONS); + } + + // krb5.conf file will be available as local resource in JM/TM container + if (hasKrb5) { + javaOpts += " -Djava.security.krb5.conf=krb5.conf"; + } + + // Set up the container launch context for the application master + ContainerLaunchContext amContainer = Records.newRecord(ContainerLaunchContext.class); + + final Map startCommandValues = new HashMap<>(); + startCommandValues.put("java", "$JAVA_HOME/bin/java"); + + String jvmHeapMem = JobManagerProcessUtils.generateJvmParametersStr(processSpec, flinkConfiguration); + startCommandValues.put("jvmmem", jvmHeapMem); + + startCommandValues.put("jvmopts", javaOpts); + startCommandValues.put("logging", YarnLogConfigUtil.getLoggingYarnCommand(flinkConfiguration)); + + startCommandValues.put("class", yarnClusterEntrypoint); + startCommandValues.put( + "redirects", + "1> " + + ApplicationConstants.LOG_DIR_EXPANSION_VAR + + "/jobmanager.out " + + "2> " + + ApplicationConstants.LOG_DIR_EXPANSION_VAR + + "/jobmanager.err"); + String dynamicParameterListStr = JobManagerProcessUtils.generateDynamicConfigsStr(processSpec); + startCommandValues.put("args", dynamicParameterListStr); + + final String commandTemplate = flinkConfiguration.getString( + ConfigConstants.YARN_CONTAINER_START_COMMAND_TEMPLATE, + ConfigConstants.DEFAULT_YARN_CONTAINER_START_COMMAND_TEMPLATE); + final String amCommand = BootstrapTools.getStartCommand(commandTemplate, startCommandValues); + + amContainer.setCommands(Collections.singletonList(amCommand)); + + LOG.debug("Application Master start command: " + amCommand); + + return amContainer; + } + + private static YarnConfigOptions.UserJarInclusion getUserJarInclusionMode( + org.apache.flink.configuration.Configuration config) { + return config.get(YarnConfigOptions.CLASSPATH_INCLUDE_USER_JAR); + } + + private static boolean isUsrLibDirIncludedInShipFiles(List shipFiles) { + return shipFiles.stream() + .filter(File::isDirectory) + .map(File::getName) + .anyMatch(name -> name.equals(DEFAULT_FLINK_USR_LIB_DIR)); + } + + private void setClusterEntrypointInfoToConfig(final ApplicationReport report) { + checkNotNull(report); + + final ApplicationId appId = report.getApplicationId(); + final String host = report.getHost(); + final int port = report.getRpcPort(); + + LOG.info("Found Web Interface {}:{} of application '{}'.", host, port, appId); + + flinkConfiguration.setString(JobManagerOptions.ADDRESS, host); + flinkConfiguration.setInteger(JobManagerOptions.PORT, port); + + flinkConfiguration.setString(RestOptions.ADDRESS, host); + flinkConfiguration.setInteger(RestOptions.PORT, port); 
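setupApplicationMasterContainer above only collects the command pieces (java, jvmmem, jvmopts, logging, class, args, redirects); the final AM command line comes from substituting them into yarn.container-start-command-template, whose default is roughly "%java% %jvmmem% %jvmopts% %logging% %class% %args% %redirects%". A rough sketch of that substitution, with buildStartCommand standing in for BootstrapTools.getStartCommand (names and values here are illustrative):

import java.util.HashMap;
import java.util.Map;

final class StartCommandSketch {

    /** Replaces each %key% placeholder in the template with its configured value. */
    static String buildStartCommand(String template, Map<String, String> values) {
        String command = template;
        for (Map.Entry<String, String> entry : values.entrySet()) {
            command = command.replace("%" + entry.getKey() + "%", entry.getValue());
        }
        return command.replaceAll("\\s+", " ").trim(); // tidy the gaps left by empty values
    }

    public static void main(String[] args) {
        Map<String, String> values = new HashMap<>();
        values.put("java", "$JAVA_HOME/bin/java");
        values.put("jvmmem", "-Xmx1073741824");
        values.put("jvmopts", "");
        values.put("logging", "-Dlog4j.configuration=file:log4j.properties");
        values.put("class", "org.apache.flink.yarn.entrypoint.YarnApplicationClusterEntryPoint");
        values.put("args", "");
        values.put("redirects", "1> <LOG_DIR>/jobmanager.out 2> <LOG_DIR>/jobmanager.err");

        String template = "%java% %jvmmem% %jvmopts% %logging% %class% %args% %redirects%";
        System.out.println(buildStartCommand(template, values));
    }
}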
+ + flinkConfiguration.set(YarnConfigOptions.APPLICATION_ID, ConverterUtils.toString(appId)); + + setHAClusterIdIfNotSet(flinkConfiguration, appId); + } + + private void setHAClusterIdIfNotSet(Configuration configuration, ApplicationId appId) { + // set cluster-id to app id if not specified + if (!configuration.contains(HighAvailabilityOptions.HA_CLUSTER_ID)) { + configuration.set(HighAvailabilityOptions.HA_CLUSTER_ID, ConverterUtils.toString(appId)); + } + } + + public static void logDetachedClusterInformation(ApplicationId yarnApplicationId, Logger logger) { + logger.info( + "The Flink YARN session cluster has been started in detached mode. In order to " + + "stop Flink gracefully, use the following command:\n" + + "$ echo \"stop\" | ./bin/yarn-session.sh -id {}\n" + + "If this should not be possible, then you can also kill Flink via YARN's web interface or via:\n" + + "$ yarn application -kill {}\n" + + "Note that killing Flink might not clean up all job artifacts and temporary files.", + yarnApplicationId, + yarnApplicationId); + } + + @VisibleForTesting + Map generateApplicationMasterEnv( + final YarnApplicationFileUploader fileUploader, + final String classPathStr, + final String localFlinkJarStr, + final String appIdStr) + throws IOException { + final Map env = new HashMap<>(); + // set user specified app master environment variables + env.putAll(ConfigurationUtils.getPrefixedKeyValuePairs( + ResourceManagerOptions.CONTAINERIZED_MASTER_ENV_PREFIX, this.flinkConfiguration)); + // set Flink app class path + env.put(ENV_FLINK_CLASSPATH, classPathStr); + // Set FLINK_LIB_DIR to `lib` folder under working dir in container + env.put(ENV_FLINK_LIB_DIR, Path.CUR_DIR + "/" + ConfigConstants.DEFAULT_FLINK_LIB_DIR); + // Set FLINK_OPT_DIR to `opt` folder under working dir in container + env.put(ENV_FLINK_OPT_DIR, Path.CUR_DIR + "/" + ConfigConstants.DEFAULT_FLINK_OPT_DIR); + // set Flink on YARN internal configuration values + env.put(YarnConfigKeys.FLINK_DIST_JAR, localFlinkJarStr); + env.put(YarnConfigKeys.ENV_APP_ID, appIdStr); + env.put(YarnConfigKeys.ENV_CLIENT_HOME_DIR, fileUploader.getHomeDir().toString()); + env.put( + YarnConfigKeys.ENV_CLIENT_SHIP_FILES, + encodeYarnLocalResourceDescriptorListToString(fileUploader.getEnvShipResourceList())); + env.put( + YarnConfigKeys.FLINK_YARN_FILES, + fileUploader.getApplicationDir().toUri().toString()); + // https://github.com/apache/hadoop/blob/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/YarnApplicationSecurity.md#identity-on-an-insecure-cluster-hadoop_user_name + env.put( + YarnConfigKeys.ENV_HADOOP_USER_NAME, + UserGroupInformation.getCurrentUser().getUserName()); + // set classpath from YARN configuration + Utils.setupYarnClassPath(this.yarnConfiguration, env); + return env; + } +} diff --git a/dinky-client/dinky-client-1.17/src/main/java/org/dinky/executor/CustomTableEnvironmentImpl.java b/dinky-client/dinky-client-1.17/src/main/java/org/dinky/executor/CustomTableEnvironmentImpl.java index 4c4994aed8..e3aa224987 100644 --- a/dinky-client/dinky-client-1.17/src/main/java/org/dinky/executor/CustomTableEnvironmentImpl.java +++ b/dinky-client/dinky-client-1.17/src/main/java/org/dinky/executor/CustomTableEnvironmentImpl.java @@ -51,9 +51,7 @@ import org.apache.flink.table.operations.ddl.CreateTableOperation; import org.apache.flink.types.Row; -import java.io.File; import java.util.ArrayList; -import java.util.Arrays; import java.util.Collections; import java.util.List; import java.util.Map; @@ -105,19 +103,6 @@ public 
static CustomTableEnvironmentImpl create( return new CustomTableEnvironmentImpl(streamTableEnvironment); } - @Override - public void addJar(File... jarPath) { - Configuration configuration = this.getRootConfiguration(); - List jars = configuration.get(PipelineOptions.JARS); - if (jars == null) { - configuration.set( - PipelineOptions.JARS, - Arrays.stream(jarPath).map(File::getAbsolutePath).collect(Collectors.toList())); - } else { - CollUtil.addAll(jars, jarPath); - } - } - @Override public boolean parseAndLoadConfiguration(String statement, Map setMap) { List operations = getParser().parse(statement); diff --git a/dinky-client/dinky-client-1.18/src/main/java/org/apache/flink/yarn/Utils.java b/dinky-client/dinky-client-1.18/src/main/java/org/apache/flink/yarn/Utils.java new file mode 100644 index 0000000000..04a71dece6 --- /dev/null +++ b/dinky-client/dinky-client-1.18/src/main/java/org/apache/flink/yarn/Utils.java @@ -0,0 +1,623 @@ +/* + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package org.apache.flink.yarn; + +import static org.apache.flink.util.Preconditions.checkArgument; +import static org.apache.flink.util.Preconditions.checkNotNull; +import static org.apache.flink.yarn.YarnConfigKeys.ENV_FLINK_CLASSPATH; +import static org.apache.flink.yarn.YarnConfigKeys.LOCAL_RESOURCE_DESCRIPTOR_SEPARATOR; + +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.flink.configuration.ConfigConstants; +import org.apache.flink.configuration.ConfigUtils; +import org.apache.flink.runtime.clusterframework.BootstrapTools; +import org.apache.flink.runtime.clusterframework.ContaineredTaskManagerParameters; +import org.apache.flink.runtime.util.HadoopUtils; +import org.apache.flink.util.StringUtils; +import org.apache.flink.util.function.FunctionWithException; +import org.apache.flink.yarn.configuration.YarnConfigOptions; +import org.apache.flink.yarn.configuration.YarnResourceManagerDriverConfiguration; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.DataOutputBuffer; +import org.apache.hadoop.security.Credentials; +import org.apache.hadoop.security.UserGroupInformation; +import org.apache.hadoop.security.token.Token; +import org.apache.hadoop.security.token.TokenIdentifier; +import org.apache.hadoop.util.StringInterner; +import org.apache.hadoop.yarn.api.ApplicationConstants; +import org.apache.hadoop.yarn.api.ApplicationConstants.Environment; +import org.apache.hadoop.yarn.api.records.ApplicationAccessType; +import org.apache.hadoop.yarn.api.records.ContainerLaunchContext; +import org.apache.hadoop.yarn.api.records.LocalResource; +import 
org.apache.hadoop.yarn.api.records.LocalResourceType; +import org.apache.hadoop.yarn.api.records.LocalResourceVisibility; +import org.apache.hadoop.yarn.api.records.Resource; +import org.apache.hadoop.yarn.api.records.URL; +import org.apache.hadoop.yarn.conf.YarnConfiguration; +import org.apache.hadoop.yarn.security.AMRMTokenIdentifier; +import org.apache.hadoop.yarn.util.Records; + +import java.io.File; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Optional; +import java.util.stream.Stream; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import cn.hutool.core.util.StrUtil; + +/** Utility class that provides helper methods to work with Apache Hadoop YARN. */ +public final class Utils { + + private static final Logger LOG = LoggerFactory.getLogger(Utils.class); + + /** KRB5 file name populated in YARN container for secure IT run. */ + public static final String KRB5_FILE_NAME = "krb5.conf"; + + /** Yarn site xml file name populated in YARN container for secure IT run. */ + public static final String YARN_SITE_FILE_NAME = "yarn-site.xml"; + + /** Constant representing a wildcard access control list. */ + private static final String WILDCARD_ACL = "*"; + + /** The prefixes that Flink adds to the YARN config. */ + private static final String[] FLINK_CONFIG_PREFIXES = {"flink.yarn."}; + + @VisibleForTesting + static final String YARN_RM_FAIR_SCHEDULER_CLAZZ = + "org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler"; + + @VisibleForTesting + static final String YARN_RM_SLS_FAIR_SCHEDULER_CLAZZ = "org.apache.hadoop.yarn.sls.scheduler.SLSFairScheduler"; + + @VisibleForTesting + static final String YARN_RM_INCREMENT_ALLOCATION_MB_KEY = "yarn.resource-types.memory-mb.increment-allocation"; + + @VisibleForTesting + static final String YARN_RM_INCREMENT_ALLOCATION_MB_LEGACY_KEY = "yarn.scheduler.increment-allocation-mb"; + + private static final int DEFAULT_YARN_RM_INCREMENT_ALLOCATION_MB = 1024; + + @VisibleForTesting + static final String YARN_RM_INCREMENT_ALLOCATION_VCORES_KEY = "yarn.resource-types.vcores.increment-allocation"; + + @VisibleForTesting + static final String YARN_RM_INCREMENT_ALLOCATION_VCORES_LEGACY_KEY = "yarn.scheduler.increment-allocation-vcores"; + + private static final int DEFAULT_YARN_RM_INCREMENT_ALLOCATION_VCORES = 1; + + public static void setupYarnClassPath(Configuration conf, Map appMasterEnv) { + addToEnvironment(appMasterEnv, Environment.CLASSPATH.name(), appMasterEnv.get(ENV_FLINK_CLASSPATH)); + String[] applicationClassPathEntries = conf.getStrings( + YarnConfiguration.YARN_APPLICATION_CLASSPATH, + Stream.of(YarnConfiguration.DEFAULT_YARN_APPLICATION_CLASSPATH) + .map(x -> StrUtil.removeAll(x, "%")) + .map(x -> "$".equals(StrUtil.subPre(x, 1)) ? x : "$" + x) + .toArray(String[]::new)); + for (String c : applicationClassPathEntries) { + addToEnvironment(appMasterEnv, Environment.CLASSPATH.name(), c.trim()); + } + } + + /** + * Deletes the YARN application files, e.g., Flink binaries, libraries, etc., from the remote + * filesystem. + * + * @param applicationFilesDir The application files directory. 
+ */ + public static void deleteApplicationFiles(final String applicationFilesDir) { + if (!StringUtils.isNullOrWhitespaceOnly(applicationFilesDir)) { + final org.apache.flink.core.fs.Path path = new org.apache.flink.core.fs.Path(applicationFilesDir); + try { + final org.apache.flink.core.fs.FileSystem fileSystem = path.getFileSystem(); + if (!fileSystem.delete(path, true)) { + LOG.error("Deleting yarn application files under {} was unsuccessful.", applicationFilesDir); + } + } catch (final IOException e) { + LOG.error("Could not properly delete yarn application files directory {}.", applicationFilesDir, e); + } + } else { + LOG.debug("No yarn application files directory set. Therefore, cannot clean up the data."); + } + } + + /** + * Creates a YARN resource for the remote object at the given location. + * + * @param remoteRsrcPath remote location of the resource + * @param resourceSize size of the resource + * @param resourceModificationTime last modification time of the resource + * @return YARN resource + */ + static LocalResource registerLocalResource( + Path remoteRsrcPath, + long resourceSize, + long resourceModificationTime, + LocalResourceVisibility resourceVisibility, + LocalResourceType resourceType) { + LocalResource localResource = Records.newRecord(LocalResource.class); + localResource.setResource(URL.fromURI(remoteRsrcPath.toUri())); + localResource.setSize(resourceSize); + localResource.setTimestamp(resourceModificationTime); + localResource.setType(resourceType); + localResource.setVisibility(resourceVisibility); + return localResource; + } + + /** + * Creates a YARN resource for the remote object at the given location. + * + * @param fs remote filesystem + * @param remoteRsrcPath resource path to be registered + * @return YARN resource + */ + private static LocalResource registerLocalResource( + FileSystem fs, Path remoteRsrcPath, LocalResourceType resourceType) throws IOException { + FileStatus jarStat = fs.getFileStatus(remoteRsrcPath); + return registerLocalResource( + remoteRsrcPath, + jarStat.getLen(), + jarStat.getModificationTime(), + LocalResourceVisibility.APPLICATION, + resourceType); + } + + /** + * Copied method from org.apache.hadoop.yarn.util.Apps. It was broken by YARN-1824 (2.4.0) and + * fixed for 2.4.1 by https://issues.apache.org/jira/browse/YARN-1931 + */ + public static void addToEnvironment(Map environment, String variable, String value) { + String val = environment.get(variable); + if (val == null) { + val = value; + } else { + val = val + YarnClusterDescriptor.pathSeparator + value; + } + environment.put(StringInterner.weakIntern(variable), StringInterner.weakIntern(val)); + } + + /** + * Resolve keytab path either as absolute path or relative to working directory. + * + * @param workingDir current working directory + * @param keytabPath configured keytab path. + * @return resolved keytab path, or null if not found. 
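The setupYarnClassPath variant above carries the Windows-to-Linux fix of this patch: when yarn.application.classpath is not set, the Hadoop defaults are rendered by the client JVM, so on Windows they arrive as %HADOOP_CONF_DIR%-style entries that a Linux NodeManager cannot expand. Stripping '%' and ensuring a leading '$' turns them back into Unix-style variables. A minimal sketch of that normalization without the hutool helpers (normalizeEntry is an illustrative name, not part of the patch):

import java.util.Arrays;

final class ClasspathNormalizerSketch {

    /** Rewrites a Windows-rendered classpath entry (%VAR%) into its Unix form ($VAR). */
    static String normalizeEntry(String entry) {
        String withoutPercent = entry.replace("%", "");
        return withoutPercent.startsWith("$") ? withoutPercent : "$" + withoutPercent;
    }

    public static void main(String[] args) {
        String[] windowsRendered = {
            "%HADOOP_CONF_DIR%",
            "%HADOOP_COMMON_HOME%/share/hadoop/common/*"
        };
        // Prints $HADOOP_CONF_DIR and $HADOOP_COMMON_HOME/share/hadoop/common/*
        Arrays.stream(windowsRendered)
                .map(ClasspathNormalizerSketch::normalizeEntry)
                .forEach(System.out::println);
    }
}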
+ */ + public static String resolveKeytabPath(String workingDir, String keytabPath) { + String keytab = null; + if (keytabPath != null) { + File f; + f = new File(keytabPath); + if (f.exists()) { + keytab = f.getAbsolutePath(); + LOG.info("Resolved keytab path: {}", keytab); + } else { + // try using relative paths, this is the case when the keytab was shipped + // as a local resource + f = new File(workingDir, keytabPath); + if (f.exists()) { + keytab = f.getAbsolutePath(); + LOG.info("Resolved keytab path: {}", keytab); + } else { + LOG.warn("Could not resolve keytab path with: {}", keytabPath); + keytab = null; + } + } + } + return keytab; + } + + /** Private constructor to prevent instantiation. */ + private Utils() { + throw new RuntimeException(); + } + + /** + * Creates the launch context, which describes how to bring up a TaskExecutor / TaskManager + * process in an allocated YARN container. + * + *
<p>
This code is extremely YARN specific and registers all the resources that the TaskExecutor + * needs (such as JAR file, config file, ...) and all environment variables in a YARN container + * launch context. The launch context then ensures that those resources will be copied into the + * containers transient working directory. + * + * @param flinkConfig The Flink configuration object. + * @param yarnConfig The YARN configuration object. + * @param configuration The YarnResourceManagerDriver configurations. + * @param tmParams The TaskExecutor container memory parameters. + * @param taskManagerDynamicProperties The dynamic configurations to be updated for the + * TaskExecutors based on client uploaded Flink config. + * @param workingDirectory The current application master container's working directory. + * @param taskManagerMainClass The class with the main method. + * @param log The logger. + * @return The launch context for the TaskManager processes. + * @throws Exception Thrown if the launch context could not be created, for example if the + * resources could not be copied. + */ + static ContainerLaunchContext createTaskExecutorContext( + org.apache.flink.configuration.Configuration flinkConfig, + YarnConfiguration yarnConfig, + YarnResourceManagerDriverConfiguration configuration, + ContaineredTaskManagerParameters tmParams, + String taskManagerDynamicProperties, + String workingDirectory, + Class taskManagerMainClass, + Logger log) + throws Exception { + + // get and validate all relevant variables + + String remoteFlinkJarPath = checkNotNull( + configuration.getFlinkDistJar(), "Environment variable %s not set", YarnConfigKeys.FLINK_DIST_JAR); + + String shipListString = checkNotNull( + configuration.getClientShipFiles(), + "Environment variable %s not set", + YarnConfigKeys.ENV_CLIENT_SHIP_FILES); + + final String remoteKeytabPath = configuration.getRemoteKeytabPath(); + final String localKeytabPath = configuration.getLocalKeytabPath(); + final String keytabPrincipal = configuration.getKeytabPrinciple(); + final String remoteYarnConfPath = configuration.getYarnSiteXMLPath(); + final String remoteKrb5Path = configuration.getKrb5Path(); + + if (log.isDebugEnabled()) { + log.debug("TM:remote keytab path obtained {}", remoteKeytabPath); + log.debug("TM:local keytab path obtained {}", localKeytabPath); + log.debug("TM:keytab principal obtained {}", keytabPrincipal); + log.debug("TM:remote yarn conf path obtained {}", remoteYarnConfPath); + log.debug("TM:remote krb5 path obtained {}", remoteKrb5Path); + } + + String classPathString = checkNotNull( + configuration.getFlinkClasspath(), + "Environment variable %s not set", + YarnConfigKeys.ENV_FLINK_CLASSPATH); + + // register keytab + LocalResource keytabResource = null; + if (remoteKeytabPath != null) { + log.info("TM:Adding keytab {} to the container local resource bucket", remoteKeytabPath); + Path keytabPath = new Path(remoteKeytabPath); + FileSystem fs = keytabPath.getFileSystem(yarnConfig); + keytabResource = registerLocalResource(fs, keytabPath, LocalResourceType.FILE); + } + + // To support Yarn Secure Integration Test Scenario + LocalResource yarnConfResource = null; + if (remoteYarnConfPath != null) { + log.info("TM:Adding remoteYarnConfPath {} to the container local resource bucket", remoteYarnConfPath); + Path yarnConfPath = new Path(remoteYarnConfPath); + FileSystem fs = yarnConfPath.getFileSystem(yarnConfig); + yarnConfResource = registerLocalResource(fs, yarnConfPath, LocalResourceType.FILE); + } + + // register krb5.conf + 
LocalResource krb5ConfResource = null; + boolean hasKrb5 = false; + if (remoteKrb5Path != null) { + log.info("Adding remoteKrb5Path {} to the container local resource bucket", remoteKrb5Path); + Path krb5ConfPath = new Path(remoteKrb5Path); + FileSystem fs = krb5ConfPath.getFileSystem(yarnConfig); + krb5ConfResource = registerLocalResource(fs, krb5ConfPath, LocalResourceType.FILE); + hasKrb5 = true; + } + + Map taskManagerLocalResources = new HashMap<>(); + + // register Flink Jar with remote HDFS + final YarnLocalResourceDescriptor flinkDistLocalResourceDesc = + YarnLocalResourceDescriptor.fromString(remoteFlinkJarPath); + taskManagerLocalResources.put( + flinkDistLocalResourceDesc.getResourceKey(), flinkDistLocalResourceDesc.toLocalResource()); + + // To support Yarn Secure Integration Test Scenario + if (yarnConfResource != null) { + taskManagerLocalResources.put(YARN_SITE_FILE_NAME, yarnConfResource); + } + if (krb5ConfResource != null) { + taskManagerLocalResources.put(KRB5_FILE_NAME, krb5ConfResource); + } + if (keytabResource != null) { + taskManagerLocalResources.put(localKeytabPath, keytabResource); + } + + // prepare additional files to be shipped + decodeYarnLocalResourceDescriptorListFromString(shipListString) + .forEach(resourceDesc -> + taskManagerLocalResources.put(resourceDesc.getResourceKey(), resourceDesc.toLocalResource())); + + // now that all resources are prepared, we can create the launch context + + log.info("Creating container launch context for TaskManagers"); + + boolean hasLogback = new File(workingDirectory, "logback.xml").exists(); + boolean hasLog4j = new File(workingDirectory, "log4j.properties").exists(); + + String launchCommand = BootstrapTools.getTaskManagerShellCommand( + flinkConfig, + tmParams, + ".", + ApplicationConstants.LOG_DIR_EXPANSION_VAR, + hasLogback, + hasLog4j, + hasKrb5, + taskManagerMainClass, + taskManagerDynamicProperties); + + if (log.isDebugEnabled()) { + log.debug("Starting TaskManagers with command: " + launchCommand); + } else { + log.info("Starting TaskManagers"); + } + + ContainerLaunchContext ctx = Records.newRecord(ContainerLaunchContext.class); + ctx.setCommands(Collections.singletonList(launchCommand)); + ctx.setLocalResources(taskManagerLocalResources); + + Map containerEnv = new HashMap<>(); + containerEnv.putAll(tmParams.taskManagerEnv()); + + // add YARN classpath, etc to the container environment + containerEnv.put(ENV_FLINK_CLASSPATH, classPathString); + setupYarnClassPath(yarnConfig, containerEnv); + + containerEnv.put( + YarnConfigKeys.ENV_HADOOP_USER_NAME, + UserGroupInformation.getCurrentUser().getUserName()); + + if (remoteKeytabPath != null && localKeytabPath != null && keytabPrincipal != null) { + containerEnv.put(YarnConfigKeys.REMOTE_KEYTAB_PATH, remoteKeytabPath); + containerEnv.put(YarnConfigKeys.LOCAL_KEYTAB_PATH, localKeytabPath); + containerEnv.put(YarnConfigKeys.KEYTAB_PRINCIPAL, keytabPrincipal); + } else if (localKeytabPath != null && keytabPrincipal != null) { + containerEnv.put(YarnConfigKeys.LOCAL_KEYTAB_PATH, localKeytabPath); + containerEnv.put(YarnConfigKeys.KEYTAB_PRINCIPAL, keytabPrincipal); + } + + ctx.setEnvironment(containerEnv); + + setAclsFor(ctx, flinkConfig); + + // For TaskManager YARN container context, read the tokens from the jobmanager yarn + // container local file. + // NOTE: must read the tokens from the local file, not from the UGI context, because if UGI + // is login + // using Kerberos keytabs, there is no HDFS delegation token in the UGI context. 
+ final String fileLocation = System.getenv(UserGroupInformation.HADOOP_TOKEN_FILE_LOCATION); + + if (fileLocation != null) { + log.debug("Adding security tokens to TaskExecutor's container launch context."); + + try (DataOutputBuffer dob = new DataOutputBuffer()) { + Credentials cred = Credentials.readTokenStorageFile( + new File(fileLocation), HadoopUtils.getHadoopConfiguration(flinkConfig)); + + // Filter out AMRMToken before setting the tokens to the TaskManager container + // context. + Credentials taskManagerCred = new Credentials(); + Collection> userTokens = cred.getAllTokens(); + for (Token token : userTokens) { + if (!token.getKind().equals(AMRMTokenIdentifier.KIND_NAME)) { + taskManagerCred.addToken(token.getService(), token); + } + } + + taskManagerCred.writeTokenStorageToStream(dob); + ByteBuffer securityTokens = ByteBuffer.wrap(dob.getData(), 0, dob.getLength()); + ctx.setTokens(securityTokens); + } catch (Throwable t) { + log.error("Failed to add Hadoop's security tokens.", t); + } + } else { + log.info("Could not set security tokens because Hadoop's token file location is unknown."); + } + + return ctx; + } + + static boolean isRemotePath(String path) throws IOException { + org.apache.flink.core.fs.Path flinkPath = new org.apache.flink.core.fs.Path(path); + return flinkPath.getFileSystem().isDistributedFS(); + } + + private static List decodeYarnLocalResourceDescriptorListFromString(String resources) + throws Exception { + final List resourceDescriptors = new ArrayList<>(); + for (String shipResourceDescStr : resources.split(LOCAL_RESOURCE_DESCRIPTOR_SEPARATOR)) { + if (!shipResourceDescStr.isEmpty()) { + resourceDescriptors.add(YarnLocalResourceDescriptor.fromString(shipResourceDescStr)); + } + } + return resourceDescriptors; + } + + @VisibleForTesting + static Resource getUnitResource(YarnConfiguration yarnConfig) { + final int unitMemMB, unitVcore; + + final String yarnRmSchedulerClazzName = yarnConfig.get(YarnConfiguration.RM_SCHEDULER); + if (Objects.equals(yarnRmSchedulerClazzName, YARN_RM_FAIR_SCHEDULER_CLAZZ) + || Objects.equals(yarnRmSchedulerClazzName, YARN_RM_SLS_FAIR_SCHEDULER_CLAZZ)) { + String propMem = yarnConfig.get(YARN_RM_INCREMENT_ALLOCATION_MB_KEY); + String propVcore = yarnConfig.get(YARN_RM_INCREMENT_ALLOCATION_VCORES_KEY); + + unitMemMB = propMem != null + ? Integer.parseInt(propMem) + : yarnConfig.getInt( + YARN_RM_INCREMENT_ALLOCATION_MB_LEGACY_KEY, DEFAULT_YARN_RM_INCREMENT_ALLOCATION_MB); + unitVcore = propVcore != null + ? 
Integer.parseInt(propVcore) + : yarnConfig.getInt( + YARN_RM_INCREMENT_ALLOCATION_VCORES_LEGACY_KEY, + DEFAULT_YARN_RM_INCREMENT_ALLOCATION_VCORES); + } else { + unitMemMB = yarnConfig.getInt( + YarnConfiguration.RM_SCHEDULER_MINIMUM_ALLOCATION_MB, + YarnConfiguration.DEFAULT_RM_SCHEDULER_MINIMUM_ALLOCATION_MB); + unitVcore = yarnConfig.getInt( + YarnConfiguration.RM_SCHEDULER_MINIMUM_ALLOCATION_VCORES, + YarnConfiguration.DEFAULT_RM_SCHEDULER_MINIMUM_ALLOCATION_VCORES); + } + + return Resource.newInstance(unitMemMB, unitVcore); + } + + public static List getQualifiedRemoteProvidedLibDirs( + org.apache.flink.configuration.Configuration configuration, YarnConfiguration yarnConfiguration) + throws IOException { + + return getRemoteSharedLibPaths(configuration, pathStr -> { + final Path path = new Path(pathStr); + return path.getFileSystem(yarnConfiguration).makeQualified(path); + }); + } + + private static List getRemoteSharedLibPaths( + org.apache.flink.configuration.Configuration configuration, + FunctionWithException strToPathMapper) + throws IOException { + + final List providedLibDirs = + ConfigUtils.decodeListFromConfig(configuration, YarnConfigOptions.PROVIDED_LIB_DIRS, strToPathMapper); + + for (Path path : providedLibDirs) { + if (!Utils.isRemotePath(path.toString())) { + throw new IllegalArgumentException("The \"" + + YarnConfigOptions.PROVIDED_LIB_DIRS.key() + + "\" should only contain" + + " dirs accessible from all worker nodes, while the \"" + + path + + "\" is local."); + } + } + return providedLibDirs; + } + + public static boolean isUsrLibDirectory(final FileSystem fileSystem, final Path path) throws IOException { + final FileStatus fileStatus = fileSystem.getFileStatus(path); + // Use the Path obj from fileStatus to get rid of trailing slash + return fileStatus.isDirectory() + && ConfigConstants.DEFAULT_FLINK_USR_LIB_DIR.equals( + fileStatus.getPath().getName()); + } + + public static Optional getQualifiedRemoteProvidedUsrLib( + org.apache.flink.configuration.Configuration configuration, YarnConfiguration yarnConfiguration) + throws IOException, IllegalArgumentException { + String usrlib = configuration.getString(YarnConfigOptions.PROVIDED_USRLIB_DIR); + if (usrlib == null) { + return Optional.empty(); + } + final Path qualifiedUsrLibPath = FileSystem.get(yarnConfiguration).makeQualified(new Path(usrlib)); + checkArgument( + isRemotePath(qualifiedUsrLibPath.toString()), + "The \"%s\" must point to a remote dir " + "which is accessible from all worker nodes.", + YarnConfigOptions.PROVIDED_USRLIB_DIR.key()); + checkArgument( + isUsrLibDirectory(FileSystem.get(yarnConfiguration), qualifiedUsrLibPath), + "The \"%s\" should be named with \"%s\".", + YarnConfigOptions.PROVIDED_USRLIB_DIR.key(), + ConfigConstants.DEFAULT_FLINK_USR_LIB_DIR); + return Optional.of(qualifiedUsrLibPath); + } + + public static YarnConfiguration getYarnAndHadoopConfiguration( + org.apache.flink.configuration.Configuration flinkConfig) { + final YarnConfiguration yarnConfig = getYarnConfiguration(flinkConfig); + yarnConfig.addResource(HadoopUtils.getHadoopConfiguration(flinkConfig)); + + return yarnConfig; + } + + /** + * Add additional config entries from the flink config to the yarn config. + * + * @param flinkConfig The Flink configuration object. + * @return The yarn configuration. 
+ */ + public static YarnConfiguration getYarnConfiguration(org.apache.flink.configuration.Configuration flinkConfig) { + final YarnConfiguration yarnConfig = new YarnConfiguration(); + + for (String key : flinkConfig.keySet()) { + for (String prefix : FLINK_CONFIG_PREFIXES) { + if (key.startsWith(prefix)) { + String newKey = key.substring("flink.".length()); + String value = flinkConfig.getString(key, null); + yarnConfig.set(newKey, value); + LOG.debug("Adding Flink config entry for {} as {}={} to Yarn config", key, newKey, value); + } + } + } + + return yarnConfig; + } + + /** + * Sets the application ACLs for the given ContainerLaunchContext based on the values specified + * in the given Flink configuration. Only ApplicationAccessType.VIEW_APP and + * ApplicationAccessType.MODIFY_APP ACLs are set, and only if they are configured in the Flink + * configuration. If the viewAcls or modifyAcls string contains the WILDCARD_ACL constant, it + * will replace the entire string with the WILDCARD_ACL. The resulting map is then set as the + * application acls for the given container launch context. + * + * @param amContainer the ContainerLaunchContext to set the ACLs for. + * @param flinkConfig the Flink configuration to read the ACL values from. + */ + public static void setAclsFor( + ContainerLaunchContext amContainer, org.apache.flink.configuration.Configuration flinkConfig) { + Map acls = new HashMap<>(); + final String viewAcls = flinkConfig.getString(YarnConfigOptions.APPLICATION_VIEW_ACLS); + final String modifyAcls = flinkConfig.getString(YarnConfigOptions.APPLICATION_MODIFY_ACLS); + validateAclString(viewAcls); + validateAclString(modifyAcls); + + if (viewAcls != null && !viewAcls.isEmpty()) { + acls.put(ApplicationAccessType.VIEW_APP, viewAcls); + } + if (modifyAcls != null && !modifyAcls.isEmpty()) { + acls.put(ApplicationAccessType.MODIFY_APP, modifyAcls); + } + if (!acls.isEmpty()) { + amContainer.setApplicationACLs(acls); + } + } + + /* Validates the ACL string to ensure that it is either null or the wildcard ACL. */ + private static void validateAclString(String acl) { + if (acl != null && acl.contains("*") && !acl.equals("*")) { + throw new IllegalArgumentException(String.format( + "Invalid wildcard ACL %s. The ACL wildcard does not support regex. The only valid wildcard ACL is '*'.", + acl)); + } + } +} diff --git a/dinky-client/dinky-client-1.18/src/main/java/org/apache/flink/yarn/YarnClusterDescriptor.java b/dinky-client/dinky-client-1.18/src/main/java/org/apache/flink/yarn/YarnClusterDescriptor.java new file mode 100644 index 0000000000..4f01396720 --- /dev/null +++ b/dinky-client/dinky-client-1.18/src/main/java/org/apache/flink/yarn/YarnClusterDescriptor.java @@ -0,0 +1,1769 @@ +/* + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +package org.apache.flink.yarn; + +import static org.apache.flink.client.deployment.application.ApplicationConfiguration.APPLICATION_MAIN_CLASS; +import static org.apache.flink.configuration.ConfigConstants.DEFAULT_FLINK_USR_LIB_DIR; +import static org.apache.flink.configuration.ConfigConstants.ENV_FLINK_LIB_DIR; +import static org.apache.flink.configuration.ConfigConstants.ENV_FLINK_OPT_DIR; +import static org.apache.flink.runtime.entrypoint.component.FileJobGraphRetriever.JOB_GRAPH_FILE_PATH; +import static org.apache.flink.util.Preconditions.checkArgument; +import static org.apache.flink.util.Preconditions.checkNotNull; +import static org.apache.flink.yarn.YarnConfigKeys.ENV_FLINK_CLASSPATH; +import static org.apache.flink.yarn.YarnConfigKeys.LOCAL_RESOURCE_DESCRIPTOR_SEPARATOR; + +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.flink.api.common.cache.DistributedCache; +import org.apache.flink.api.java.tuple.Tuple2; +import org.apache.flink.client.deployment.ClusterDeploymentException; +import org.apache.flink.client.deployment.ClusterDescriptor; +import org.apache.flink.client.deployment.ClusterRetrieveException; +import org.apache.flink.client.deployment.ClusterSpecification; +import org.apache.flink.client.deployment.application.ApplicationConfiguration; +import org.apache.flink.client.program.ClusterClientProvider; +import org.apache.flink.client.program.PackagedProgramUtils; +import org.apache.flink.client.program.rest.RestClusterClient; +import org.apache.flink.configuration.ConfigConstants; +import org.apache.flink.configuration.ConfigOption; +import org.apache.flink.configuration.ConfigUtils; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.configuration.ConfigurationUtils; +import org.apache.flink.configuration.CoreOptions; +import org.apache.flink.configuration.HighAvailabilityOptions; +import org.apache.flink.configuration.IllegalConfigurationException; +import org.apache.flink.configuration.JobManagerOptions; +import org.apache.flink.configuration.MemorySize; +import org.apache.flink.configuration.PipelineOptions; +import org.apache.flink.configuration.ResourceManagerOptions; +import org.apache.flink.configuration.RestOptions; +import org.apache.flink.configuration.SecurityOptions; +import org.apache.flink.configuration.TaskManagerOptions; +import org.apache.flink.core.plugin.PluginConfig; +import org.apache.flink.core.plugin.PluginUtils; +import org.apache.flink.runtime.clusterframework.BootstrapTools; +import org.apache.flink.runtime.entrypoint.ClusterEntrypoint; +import org.apache.flink.runtime.entrypoint.ClusterEntrypointUtils; +import org.apache.flink.runtime.jobgraph.JobGraph; +import org.apache.flink.runtime.jobmanager.HighAvailabilityMode; +import org.apache.flink.runtime.jobmanager.JobManagerProcessSpec; +import org.apache.flink.runtime.jobmanager.JobManagerProcessUtils; +import org.apache.flink.runtime.security.token.DefaultDelegationTokenManager; +import org.apache.flink.runtime.security.token.DelegationTokenContainer; +import org.apache.flink.runtime.security.token.DelegationTokenManager; +import org.apache.flink.runtime.security.token.hadoop.HadoopDelegationTokenConverter; +import org.apache.flink.runtime.security.token.hadoop.KerberosLoginProvider; +import org.apache.flink.runtime.util.HadoopUtils; +import org.apache.flink.util.CollectionUtil; +import org.apache.flink.util.FlinkException; +import org.apache.flink.util.Preconditions; +import org.apache.flink.util.ShutdownHookUtil; +import 
org.apache.flink.util.StringUtils; +import org.apache.flink.yarn.configuration.YarnConfigOptions; +import org.apache.flink.yarn.configuration.YarnConfigOptionsInternal; +import org.apache.flink.yarn.configuration.YarnDeploymentTarget; +import org.apache.flink.yarn.configuration.YarnLogConfigUtil; +import org.apache.flink.yarn.entrypoint.YarnApplicationClusterEntryPoint; +import org.apache.flink.yarn.entrypoint.YarnJobClusterEntrypoint; +import org.apache.flink.yarn.entrypoint.YarnSessionClusterEntrypoint; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hdfs.DFSConfigKeys; +import org.apache.hadoop.security.Credentials; +import org.apache.hadoop.security.UserGroupInformation; +import org.apache.hadoop.security.token.Token; +import org.apache.hadoop.security.token.TokenIdentifier; +import org.apache.hadoop.yarn.api.ApplicationConstants; +import org.apache.hadoop.yarn.api.protocolrecords.GetNewApplicationResponse; +import org.apache.hadoop.yarn.api.records.ApplicationId; +import org.apache.hadoop.yarn.api.records.ApplicationReport; +import org.apache.hadoop.yarn.api.records.ApplicationSubmissionContext; +import org.apache.hadoop.yarn.api.records.ContainerLaunchContext; +import org.apache.hadoop.yarn.api.records.FinalApplicationStatus; +import org.apache.hadoop.yarn.api.records.LocalResourceType; +import org.apache.hadoop.yarn.api.records.NodeReport; +import org.apache.hadoop.yarn.api.records.NodeState; +import org.apache.hadoop.yarn.api.records.Priority; +import org.apache.hadoop.yarn.api.records.QueueInfo; +import org.apache.hadoop.yarn.api.records.Resource; +import org.apache.hadoop.yarn.api.records.YarnApplicationState; +import org.apache.hadoop.yarn.api.records.YarnClusterMetrics; +import org.apache.hadoop.yarn.client.api.YarnClient; +import org.apache.hadoop.yarn.client.api.YarnClientApplication; +import org.apache.hadoop.yarn.conf.YarnConfiguration; +import org.apache.hadoop.yarn.exceptions.YarnException; +import org.apache.hadoop.yarn.util.ConverterUtils; +import org.apache.hadoop.yarn.util.Records; + +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.ObjectOutputStream; +import java.io.PrintStream; +import java.io.UnsupportedEncodingException; +import java.lang.reflect.InvocationTargetException; +import java.lang.reflect.Method; +import java.net.URI; +import java.net.URLDecoder; +import java.nio.ByteBuffer; +import java.nio.charset.Charset; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Set; +import java.util.stream.Collectors; + +import javax.annotation.Nullable; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** The descriptor with deployment information for deploying a Flink cluster on Yarn. 
*/ +public class YarnClusterDescriptor implements ClusterDescriptor { + private static final Logger LOG = LoggerFactory.getLogger(YarnClusterDescriptor.class); + public static final String pathSeparator = ":"; + + @VisibleForTesting + static final String IGNORE_UNRECOGNIZED_VM_OPTIONS = "-XX:+IgnoreUnrecognizedVMOptions"; + + private final YarnConfiguration yarnConfiguration; + + private final YarnClient yarnClient; + + private final YarnClusterInformationRetriever yarnClusterInformationRetriever; + + /** True if the descriptor must not shut down the YarnClient. */ + private final boolean sharedYarnClient; + + /** Lazily initialized list of files to ship. */ + private final List shipFiles = new LinkedList<>(); + + private final List shipArchives = new LinkedList<>(); + + private final String yarnQueue; + + private Path flinkJarPath; + + private final Configuration flinkConfiguration; + + private final String customName; + + private final String nodeLabel; + + private final String applicationType; + + private YarnConfigOptions.UserJarInclusion userJarInclusion; + + public YarnClusterDescriptor( + Configuration flinkConfiguration, + YarnConfiguration yarnConfiguration, + YarnClient yarnClient, + YarnClusterInformationRetriever yarnClusterInformationRetriever, + boolean sharedYarnClient) { + + this.yarnConfiguration = Preconditions.checkNotNull(yarnConfiguration); + this.yarnClient = Preconditions.checkNotNull(yarnClient); + this.yarnClusterInformationRetriever = Preconditions.checkNotNull(yarnClusterInformationRetriever); + this.sharedYarnClient = sharedYarnClient; + + this.flinkConfiguration = Preconditions.checkNotNull(flinkConfiguration); + this.userJarInclusion = getUserJarInclusionMode(flinkConfiguration); + + getLocalFlinkDistPath(flinkConfiguration).ifPresent(this::setLocalJarPath); + decodeFilesToShipToCluster(flinkConfiguration, YarnConfigOptions.SHIP_FILES) + .ifPresent(this::addShipFiles); + decodeFilesToShipToCluster(flinkConfiguration, YarnConfigOptions.SHIP_ARCHIVES) + .ifPresent(this::addShipArchives); + + this.yarnQueue = flinkConfiguration.getString(YarnConfigOptions.APPLICATION_QUEUE); + this.customName = flinkConfiguration.getString(YarnConfigOptions.APPLICATION_NAME); + this.applicationType = flinkConfiguration.getString(YarnConfigOptions.APPLICATION_TYPE); + this.nodeLabel = flinkConfiguration.getString(YarnConfigOptions.NODE_LABEL); + } + + private Optional> decodeFilesToShipToCluster( + final Configuration configuration, final ConfigOption> configOption) { + checkNotNull(configuration); + checkNotNull(configOption); + + final List files = ConfigUtils.decodeListFromConfig(configuration, configOption, File::new); + return files.isEmpty() ? Optional.empty() : Optional.of(files); + } + + private Optional getLocalFlinkDistPath(final Configuration configuration) { + final String localJarPath = configuration.getString(YarnConfigOptions.FLINK_DIST_JAR); + if (localJarPath != null) { + return Optional.of(new Path(localJarPath)); + } + + LOG.info("No path for the flink jar passed. Using the location of " + getClass() + " to locate the jar"); + + // check whether it's actually a jar file --> when testing we execute this class without a + // flink-dist jar + final String decodedPath = getDecodedJarPath(); + return decodedPath.endsWith(".jar") ? 
Optional.of(new Path(new File(decodedPath).toURI())) : Optional.empty(); + } + + private String getDecodedJarPath() { + final String encodedJarPath = YarnClusterClientFactory.class + .getProtectionDomain() + .getCodeSource() + .getLocation() + .getPath(); + try { + return URLDecoder.decode(encodedJarPath, Charset.defaultCharset().name()); + } catch (UnsupportedEncodingException e) { + throw new RuntimeException("Couldn't decode the encoded Flink dist jar path: " + + encodedJarPath + + " You can supply a path manually via the command line."); + } + } + + @VisibleForTesting + List getShipFiles() { + return shipFiles; + } + + public YarnClient getYarnClient() { + return yarnClient; + } + + /** + * The class to start the application master with. This class runs the main method in case of + * session cluster. + */ + protected String getYarnSessionClusterEntrypoint() { + return YarnSessionClusterEntrypoint.class.getName(); + } + + /** + * The class to start the application master with. This class runs the main method in case of + * the job cluster. + */ + protected String getYarnJobClusterEntrypoint() { + return YarnJobClusterEntrypoint.class.getName(); + } + + public Configuration getFlinkConfiguration() { + return flinkConfiguration; + } + + public void setLocalJarPath(Path localJarPath) { + if (!localJarPath.toString().endsWith("jar")) { + throw new IllegalArgumentException( + "The passed jar path ('" + localJarPath + "') does not end with the 'jar' extension"); + } + this.flinkJarPath = localJarPath; + } + + /** + * Adds the given files to the list of files to ship. + * + *
<p>
Note that any file matching "flink-dist*.jar" will be excluded from the upload by + * {@link YarnApplicationFileUploader#registerMultipleLocalResources(Collection, String, + * LocalResourceType)} since we upload the Flink uber jar ourselves and do not need to deploy it + * multiple times. + * + * @param shipFiles files to ship + */ + public void addShipFiles(List shipFiles) { + checkArgument( + !isUsrLibDirIncludedInShipFiles(shipFiles), + "User-shipped directories configured via : %s should not include %s.", + YarnConfigOptions.SHIP_FILES.key(), + ConfigConstants.DEFAULT_FLINK_USR_LIB_DIR); + this.shipFiles.addAll(shipFiles); + } + + private void addShipArchives(List shipArchives) { + checkArgument( + isArchiveOnlyIncludedInShipArchiveFiles(shipArchives), + "Directories or non-archive files are included."); + this.shipArchives.addAll(shipArchives); + } + + private static boolean isArchiveOnlyIncludedInShipArchiveFiles(List shipFiles) { + long archivedFileCount = shipFiles.stream() + .filter(File::isFile) + .map(File::getName) + .map(String::toLowerCase) + .filter(name -> name.endsWith(".tar.gz") + || name.endsWith(".tar") + || name.endsWith(".tgz") + || name.endsWith(".dst") + || name.endsWith(".jar") + || name.endsWith(".zip")) + .count(); + return archivedFileCount == shipFiles.size(); + } + + private void isReadyForDeployment(ClusterSpecification clusterSpecification) throws Exception { + + if (this.flinkJarPath == null) { + throw new YarnDeploymentException("The Flink jar path is null"); + } + if (this.flinkConfiguration == null) { + throw new YarnDeploymentException("Flink configuration object has not been set"); + } + + // Check if we don't exceed YARN's maximum virtual cores. + final int numYarnMaxVcores = yarnClusterInformationRetriever.getMaxVcores(); + + int configuredAmVcores = flinkConfiguration.getInteger(YarnConfigOptions.APP_MASTER_VCORES); + if (configuredAmVcores > numYarnMaxVcores) { + throw new IllegalConfigurationException(String.format( + "The number of requested virtual cores for application master %d" + + " exceeds the maximum number of virtual cores %d available in the Yarn Cluster.", + configuredAmVcores, numYarnMaxVcores)); + } + + int configuredVcores = + flinkConfiguration.getInteger(YarnConfigOptions.VCORES, clusterSpecification.getSlotsPerTaskManager()); + // don't configure more than the maximum configured number of vcores + if (configuredVcores > numYarnMaxVcores) { + throw new IllegalConfigurationException(String.format( + "The number of requested virtual cores per node %d" + + " exceeds the maximum number of virtual cores %d available in the Yarn Cluster." + + " Please note that the number of virtual cores is set to the number of task slots by default" + + " unless configured in the Flink config with '%s.'", + configuredVcores, numYarnMaxVcores, YarnConfigOptions.VCORES.key())); + } + + // check if required Hadoop environment variables are set. If not, warn user + if (System.getenv("HADOOP_CONF_DIR") == null && System.getenv("YARN_CONF_DIR") == null) { + LOG.warn("Neither the HADOOP_CONF_DIR nor the YARN_CONF_DIR environment variable is set. 
" + + "The Flink YARN Client needs one of these to be set to properly load the Hadoop " + + "configuration for accessing YARN."); + } + } + + public String getNodeLabel() { + return nodeLabel; + } + + // ------------------------------------------------------------- + // Lifecycle management + // ------------------------------------------------------------- + + @Override + public void close() { + if (!sharedYarnClient) { + yarnClient.stop(); + } + } + + // ------------------------------------------------------------- + // ClusterClient overrides + // ------------------------------------------------------------- + + @Override + public ClusterClientProvider retrieve(ApplicationId applicationId) throws ClusterRetrieveException { + + try { + // check if required Hadoop environment variables are set. If not, warn user + if (System.getenv("HADOOP_CONF_DIR") == null && System.getenv("YARN_CONF_DIR") == null) { + LOG.warn("Neither the HADOOP_CONF_DIR nor the YARN_CONF_DIR environment variable is set." + + "The Flink YARN Client needs one of these to be set to properly load the Hadoop " + + "configuration for accessing YARN."); + } + + final ApplicationReport report = yarnClient.getApplicationReport(applicationId); + + if (report.getFinalApplicationStatus() != FinalApplicationStatus.UNDEFINED) { + // Flink cluster is not running anymore + LOG.error( + "The application {} doesn't run anymore. It has previously completed with final status: {}", + applicationId, + report.getFinalApplicationStatus()); + throw new RuntimeException("The Yarn application " + applicationId + " doesn't run anymore."); + } + + setClusterEntrypointInfoToConfig(report); + + return () -> { + try { + return new RestClusterClient<>(flinkConfiguration, report.getApplicationId()); + } catch (Exception e) { + throw new RuntimeException("Couldn't retrieve Yarn cluster", e); + } + }; + } catch (Exception e) { + throw new ClusterRetrieveException("Couldn't retrieve Yarn cluster", e); + } + } + + @Override + public ClusterClientProvider deploySessionCluster(ClusterSpecification clusterSpecification) + throws ClusterDeploymentException { + try { + return deployInternal( + clusterSpecification, "Flink session cluster", getYarnSessionClusterEntrypoint(), null, false); + } catch (Exception e) { + throw new ClusterDeploymentException("Couldn't deploy Yarn session cluster", e); + } + } + + @Override + public ClusterClientProvider deployApplicationCluster( + final ClusterSpecification clusterSpecification, final ApplicationConfiguration applicationConfiguration) + throws ClusterDeploymentException { + checkNotNull(clusterSpecification); + checkNotNull(applicationConfiguration); + + final YarnDeploymentTarget deploymentTarget = YarnDeploymentTarget.fromConfig(flinkConfiguration); + if (YarnDeploymentTarget.APPLICATION != deploymentTarget) { + throw new ClusterDeploymentException("Couldn't deploy Yarn Application Cluster." + + " Expected deployment.target=" + + YarnDeploymentTarget.APPLICATION.getName() + + " but actual one was \"" + + deploymentTarget.getName() + + "\""); + } + + applicationConfiguration.applyToConfiguration(flinkConfiguration); + + // No need to do pipelineJars validation if it is a PyFlink job. 
+ if (!(PackagedProgramUtils.isPython(applicationConfiguration.getApplicationClassName()) + || PackagedProgramUtils.isPython(applicationConfiguration.getProgramArguments()))) { + final List pipelineJars = + flinkConfiguration.getOptional(PipelineOptions.JARS).orElse(Collections.emptyList()); + Preconditions.checkArgument(pipelineJars.size() == 1, "Should only have one jar"); + } + + try { + return deployInternal( + clusterSpecification, + "Flink Application Cluster", + YarnApplicationClusterEntryPoint.class.getName(), + null, + false); + } catch (Exception e) { + throw new ClusterDeploymentException("Couldn't deploy Yarn Application Cluster", e); + } + } + + @Override + public ClusterClientProvider deployJobCluster( + ClusterSpecification clusterSpecification, JobGraph jobGraph, boolean detached) + throws ClusterDeploymentException { + + LOG.warn( + "Job Clusters are deprecated since Flink 1.15. Please use an Application Cluster/Application Mode instead."); + try { + return deployInternal( + clusterSpecification, "Flink per-job cluster", getYarnJobClusterEntrypoint(), jobGraph, detached); + } catch (Exception e) { + throw new ClusterDeploymentException("Could not deploy Yarn job cluster.", e); + } + } + + @Override + public void killCluster(ApplicationId applicationId) throws FlinkException { + try { + yarnClient.killApplication(applicationId); + + try (final FileSystem fs = FileSystem.get(yarnConfiguration)) { + final Path applicationDir = + YarnApplicationFileUploader.getApplicationDirPath(getStagingDir(fs), applicationId); + + Utils.deleteApplicationFiles(applicationDir.toUri().toString()); + } + + } catch (YarnException | IOException e) { + throw new FlinkException("Could not kill the Yarn Flink cluster with id " + applicationId + '.', e); + } + } + + /** + * This method will block until the ApplicationMaster/JobManager have been deployed on YARN. + * + * @param clusterSpecification Initial cluster specification for the Flink cluster to be + * deployed + * @param applicationName name of the Yarn application to start + * @param yarnClusterEntrypoint Class name of the Yarn cluster entry point. 
+ * @param jobGraph A job graph which is deployed with the Flink cluster, {@code null} if none + * @param detached True if the cluster should be started in detached mode + */ + private ClusterClientProvider deployInternal( + ClusterSpecification clusterSpecification, + String applicationName, + String yarnClusterEntrypoint, + @Nullable JobGraph jobGraph, + boolean detached) + throws Exception { + + final UserGroupInformation currentUser = UserGroupInformation.getCurrentUser(); + if (HadoopUtils.isKerberosSecurityEnabled(currentUser)) { + boolean useTicketCache = flinkConfiguration.getBoolean(SecurityOptions.KERBEROS_LOGIN_USETICKETCACHE); + + if (!HadoopUtils.areKerberosCredentialsValid(currentUser, useTicketCache)) { + throw new RuntimeException("Hadoop security with Kerberos is enabled but the login user " + + "does not have Kerberos credentials or delegation tokens!"); + } + + final boolean fetchToken = flinkConfiguration.getBoolean(SecurityOptions.KERBEROS_FETCH_DELEGATION_TOKEN); + final boolean yarnAccessFSEnabled = !CollectionUtil.isNullOrEmpty( + flinkConfiguration.get(SecurityOptions.KERBEROS_HADOOP_FILESYSTEMS_TO_ACCESS)); + if (!fetchToken && yarnAccessFSEnabled) { + throw new IllegalConfigurationException(String.format( + "When %s is disabled, %s must be disabled as well.", + SecurityOptions.KERBEROS_FETCH_DELEGATION_TOKEN.key(), + SecurityOptions.KERBEROS_HADOOP_FILESYSTEMS_TO_ACCESS.key())); + } + } + + isReadyForDeployment(clusterSpecification); + + // ------------------ Check if the specified queue exists -------------------- + + checkYarnQueues(yarnClient); + + // ------------------ Check if the YARN ClusterClient has the requested resources + // -------------- + + // Create application via yarnClient + final YarnClientApplication yarnApplication = yarnClient.createApplication(); + final GetNewApplicationResponse appResponse = yarnApplication.getNewApplicationResponse(); + + Resource maxRes = appResponse.getMaximumResourceCapability(); + + final ClusterResourceDescription freeClusterMem; + try { + freeClusterMem = getCurrentFreeClusterResources(yarnClient); + } catch (YarnException | IOException e) { + failSessionDuringDeployment(yarnClient, yarnApplication); + throw new YarnDeploymentException("Could not retrieve information about free cluster resources.", e); + } + + final int yarnMinAllocationMB = yarnConfiguration.getInt( + YarnConfiguration.RM_SCHEDULER_MINIMUM_ALLOCATION_MB, + YarnConfiguration.DEFAULT_RM_SCHEDULER_MINIMUM_ALLOCATION_MB); + if (yarnMinAllocationMB <= 0) { + throw new YarnDeploymentException("The minimum allocation memory " + + "(" + + yarnMinAllocationMB + + " MB) configured via '" + + YarnConfiguration.RM_SCHEDULER_MINIMUM_ALLOCATION_MB + + "' should be greater than 0."); + } + + final ClusterSpecification validClusterSpecification; + try { + validClusterSpecification = + validateClusterResources(clusterSpecification, yarnMinAllocationMB, maxRes, freeClusterMem); + } catch (YarnDeploymentException yde) { + failSessionDuringDeployment(yarnClient, yarnApplication); + throw yde; + } + + LOG.info("Cluster specification: {}", validClusterSpecification); + + final ClusterEntrypoint.ExecutionMode executionMode = + detached ? 
ClusterEntrypoint.ExecutionMode.DETACHED : ClusterEntrypoint.ExecutionMode.NORMAL; + + flinkConfiguration.setString(ClusterEntrypoint.INTERNAL_CLUSTER_EXECUTION_MODE, executionMode.toString()); + + ApplicationReport report = startAppMaster( + flinkConfiguration, + applicationName, + yarnClusterEntrypoint, + jobGraph, + yarnClient, + yarnApplication, + validClusterSpecification); + + // print the application id for user to cancel themselves. + if (detached) { + final ApplicationId yarnApplicationId = report.getApplicationId(); + logDetachedClusterInformation(yarnApplicationId, LOG); + } + + setClusterEntrypointInfoToConfig(report); + + return () -> { + try { + return new RestClusterClient<>(flinkConfiguration, report.getApplicationId()); + } catch (Exception e) { + throw new RuntimeException("Error while creating RestClusterClient.", e); + } + }; + } + + private ClusterSpecification validateClusterResources( + ClusterSpecification clusterSpecification, + int yarnMinAllocationMB, + Resource maximumResourceCapability, + ClusterResourceDescription freeClusterResources) + throws YarnDeploymentException { + + int jobManagerMemoryMb = clusterSpecification.getMasterMemoryMB(); + final int taskManagerMemoryMb = clusterSpecification.getTaskManagerMemoryMB(); + + logIfComponentMemNotIntegerMultipleOfYarnMinAllocation("JobManager", jobManagerMemoryMb, yarnMinAllocationMB); + logIfComponentMemNotIntegerMultipleOfYarnMinAllocation("TaskManager", taskManagerMemoryMb, yarnMinAllocationMB); + + // set the memory to minAllocationMB to do the next checks correctly + if (jobManagerMemoryMb < yarnMinAllocationMB) { + jobManagerMemoryMb = yarnMinAllocationMB; + } + + final String note = + "Please check the 'yarn.scheduler.maximum-allocation-mb' and the 'yarn.nodemanager.resource.memory-mb' configuration values\n"; + if (jobManagerMemoryMb > maximumResourceCapability.getMemorySize()) { + throw new YarnDeploymentException( + "The cluster does not have the requested resources for the JobManager available!\n" + + "Maximum Memory: " + + maximumResourceCapability.getMemorySize() + + "MB Requested: " + + jobManagerMemoryMb + + "MB. " + + note); + } + + if (taskManagerMemoryMb > maximumResourceCapability.getMemorySize()) { + throw new YarnDeploymentException( + "The cluster does not have the requested resources for the TaskManagers available!\n" + + "Maximum Memory: " + + maximumResourceCapability.getMemorySize() + + " Requested: " + + taskManagerMemoryMb + + "MB. " + + note); + } + + final String noteRsc = + "\nThe Flink YARN client will try to allocate the YARN session, but maybe not all TaskManagers are " + + "connecting from the beginning because the resources are currently not available in the cluster. 
" + + "The allocation might take more time than usual because the Flink YARN client needs to wait until " + + "the resources become available."; + + if (taskManagerMemoryMb > freeClusterResources.containerLimit) { + LOG.warn("The requested amount of memory for the TaskManagers (" + + taskManagerMemoryMb + + "MB) is more than " + + "the largest possible YARN container: " + + freeClusterResources.containerLimit + + noteRsc); + } + if (jobManagerMemoryMb > freeClusterResources.containerLimit) { + LOG.warn("The requested amount of memory for the JobManager (" + + jobManagerMemoryMb + + "MB) is more than " + + "the largest possible YARN container: " + + freeClusterResources.containerLimit + + noteRsc); + } + + return new ClusterSpecification.ClusterSpecificationBuilder() + .setMasterMemoryMB(jobManagerMemoryMb) + .setTaskManagerMemoryMB(taskManagerMemoryMb) + .setSlotsPerTaskManager(clusterSpecification.getSlotsPerTaskManager()) + .createClusterSpecification(); + } + + private void logIfComponentMemNotIntegerMultipleOfYarnMinAllocation( + String componentName, int componentMemoryMB, int yarnMinAllocationMB) { + int normalizedMemMB = + (componentMemoryMB + (yarnMinAllocationMB - 1)) / yarnMinAllocationMB * yarnMinAllocationMB; + if (normalizedMemMB <= 0) { + normalizedMemMB = yarnMinAllocationMB; + } + if (componentMemoryMB != normalizedMemMB) { + LOG.info( + "The configured {} memory is {} MB. YARN will allocate {} MB to make up an integer multiple of its " + + "minimum allocation memory ({} MB, configured via 'yarn.scheduler.minimum-allocation-mb'). The extra {} MB " + + "may not be used by Flink.", + componentName, + componentMemoryMB, + normalizedMemMB, + yarnMinAllocationMB, + normalizedMemMB - componentMemoryMB); + } + } + + private void checkYarnQueues(YarnClient yarnClient) { + try { + List queues = yarnClient.getAllQueues(); + if (queues.size() > 0 + && this.yarnQueue != null) { // check only if there are queues configured in yarn and for + // this session. + boolean queueFound = false; + for (QueueInfo queue : queues) { + if (queue.getQueueName().equals(this.yarnQueue) + || queue.getQueueName().equals("root." + this.yarnQueue)) { + queueFound = true; + break; + } + } + if (!queueFound) { + String queueNames = StringUtils.toQuotedListString(queues.toArray()); + LOG.warn("The specified queue '" + + this.yarnQueue + + "' does not exist. " + + "Available queues: " + + queueNames); + } + } else { + LOG.debug("The YARN cluster does not have any queues configured"); + } + } catch (Throwable e) { + LOG.warn("Error while getting queue information from YARN: " + e.getMessage()); + if (LOG.isDebugEnabled()) { + LOG.debug("Error details", e); + } + } + } + + private ApplicationReport startAppMaster( + Configuration configuration, + String applicationName, + String yarnClusterEntrypoint, + JobGraph jobGraph, + YarnClient yarnClient, + YarnClientApplication yarnApplication, + ClusterSpecification clusterSpecification) + throws Exception { + + // ------------------ Initialize the file systems ------------------------- + + org.apache.flink.core.fs.FileSystem.initialize( + configuration, PluginUtils.createPluginManagerFromRootFolder(configuration)); + + final FileSystem fs = FileSystem.get(yarnConfiguration); + + // hard coded check for the GoogleHDFS client because its not overriding the getScheme() + // method. + if (!fs.getClass().getSimpleName().equals("GoogleHadoopFileSystem") + && fs.getScheme().startsWith("file")) { + LOG.warn("The file system scheme is '" + + fs.getScheme() + + "'. 
This indicates that the " + + "specified Hadoop configuration path is wrong and the system is using the default Hadoop configuration values." + + "The Flink YARN client needs to store its files in a distributed file system"); + } + + ApplicationSubmissionContext appContext = yarnApplication.getApplicationSubmissionContext(); + + final List providedLibDirs = Utils.getQualifiedRemoteProvidedLibDirs(configuration, yarnConfiguration); + + final Optional providedUsrLibDir = + Utils.getQualifiedRemoteProvidedUsrLib(configuration, yarnConfiguration); + + Path stagingDirPath = getStagingDir(fs); + FileSystem stagingDirFs = stagingDirPath.getFileSystem(yarnConfiguration); + final YarnApplicationFileUploader fileUploader = YarnApplicationFileUploader.from( + stagingDirFs, stagingDirPath, providedLibDirs, appContext.getApplicationId(), getFileReplication()); + + // The files need to be shipped and added to classpath. + Set systemShipFiles = CollectionUtil.newHashSetWithExpectedSize(shipFiles.size()); + for (File file : shipFiles) { + systemShipFiles.add(file.getAbsoluteFile()); + } + + final String logConfigFilePath = configuration.getString(YarnConfigOptionsInternal.APPLICATION_LOG_CONFIG_FILE); + if (logConfigFilePath != null) { + systemShipFiles.add(new File(logConfigFilePath)); + } + + // Set-up ApplicationSubmissionContext for the application + + final ApplicationId appId = appContext.getApplicationId(); + + // ------------------ Add Zookeeper namespace to local flinkConfiguration ------ + setHAClusterIdIfNotSet(configuration, appId); + + if (HighAvailabilityMode.isHighAvailabilityModeActivated(configuration)) { + // activate re-execution of failed applications + appContext.setMaxAppAttempts(configuration.getInteger( + YarnConfigOptions.APPLICATION_ATTEMPTS.key(), YarnConfiguration.DEFAULT_RM_AM_MAX_ATTEMPTS)); + + activateHighAvailabilitySupport(appContext); + } else { + // set number of application retries to 1 in the default case + appContext.setMaxAppAttempts(configuration.getInteger(YarnConfigOptions.APPLICATION_ATTEMPTS.key(), 1)); + } + + final Set userJarFiles = new HashSet<>(); + if (jobGraph != null) { + userJarFiles.addAll(jobGraph.getUserJars().stream() + .map(f -> f.toUri()) + .map(Path::new) + .collect(Collectors.toSet())); + } + + final List jarUrls = ConfigUtils.decodeListFromConfig(configuration, PipelineOptions.JARS, URI::create); + if (jarUrls != null && YarnApplicationClusterEntryPoint.class.getName().equals(yarnClusterEntrypoint)) { + userJarFiles.addAll(jarUrls.stream().map(Path::new).collect(Collectors.toSet())); + } + + // only for per job mode + if (jobGraph != null) { + for (Map.Entry entry : + jobGraph.getUserArtifacts().entrySet()) { + // only upload local files + if (!Utils.isRemotePath(entry.getValue().filePath)) { + Path localPath = new Path(entry.getValue().filePath); + Tuple2 remoteFileInfo = fileUploader.uploadLocalFileToRemote(localPath, entry.getKey()); + jobGraph.setUserArtifactRemotePath(entry.getKey(), remoteFileInfo.f0.toString()); + } + } + + jobGraph.writeUserArtifactEntriesToConfiguration(); + } + + if (providedLibDirs == null || providedLibDirs.isEmpty()) { + addLibFoldersToShipFiles(systemShipFiles); + } + + // Register all files in provided lib dirs as local resources with public visibility + // and upload the remaining dependencies as local resources with APPLICATION visibility. 
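+ // Entries registered here end up on the container classpath; ship-only resources (plugins, archives) are registered further below without classpath entries.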
+ final List systemClassPaths = fileUploader.registerProvidedLocalResources(); + final List uploadedDependencies = fileUploader.registerMultipleLocalResources( + systemShipFiles.stream().map(e -> new Path(e.toURI())).collect(Collectors.toSet()), + Path.CUR_DIR, + LocalResourceType.FILE); + systemClassPaths.addAll(uploadedDependencies); + + // upload and register ship-only files + // Plugin files only need to be shipped and should not be added to classpath. + if (providedLibDirs == null || providedLibDirs.isEmpty()) { + Set shipOnlyFiles = new HashSet<>(); + addPluginsFoldersToShipFiles(shipOnlyFiles); + fileUploader.registerMultipleLocalResources( + shipOnlyFiles.stream().map(e -> new Path(e.toURI())).collect(Collectors.toSet()), + Path.CUR_DIR, + LocalResourceType.FILE); + } + + if (!shipArchives.isEmpty()) { + fileUploader.registerMultipleLocalResources( + shipArchives.stream().map(e -> new Path(e.toURI())).collect(Collectors.toSet()), + Path.CUR_DIR, + LocalResourceType.ARCHIVE); + } + + // only for application mode + // Python jar file only needs to be shipped and should not be added to classpath. + if (YarnApplicationClusterEntryPoint.class.getName().equals(yarnClusterEntrypoint) + && PackagedProgramUtils.isPython(configuration.get(APPLICATION_MAIN_CLASS))) { + fileUploader.registerMultipleLocalResources( + Collections.singletonList( + new Path(PackagedProgramUtils.getPythonJar().toURI())), + ConfigConstants.DEFAULT_FLINK_OPT_DIR, + LocalResourceType.FILE); + } + + // Upload and register user jars + final List userClassPaths = fileUploader.registerMultipleLocalResources( + userJarFiles, + userJarInclusion == YarnConfigOptions.UserJarInclusion.DISABLED + ? ConfigConstants.DEFAULT_FLINK_USR_LIB_DIR + : Path.CUR_DIR, + LocalResourceType.FILE); + + // usrlib in remote will be used first. + if (providedUsrLibDir.isPresent()) { + final List usrLibClassPaths = fileUploader.registerMultipleLocalResources( + Collections.singletonList(providedUsrLibDir.get()), Path.CUR_DIR, LocalResourceType.FILE); + userClassPaths.addAll(usrLibClassPaths); + } else if (ClusterEntrypointUtils.tryFindUserLibDirectory().isPresent()) { + // local usrlib will be automatically shipped if it exists and there is no remote + // usrlib. 
+ final Set usrLibShipFiles = new HashSet<>(); + addUsrLibFolderToShipFiles(usrLibShipFiles); + final List usrLibClassPaths = fileUploader.registerMultipleLocalResources( + usrLibShipFiles.stream().map(e -> new Path(e.toURI())).collect(Collectors.toSet()), + Path.CUR_DIR, + LocalResourceType.FILE); + userClassPaths.addAll(usrLibClassPaths); + } + + if (userJarInclusion == YarnConfigOptions.UserJarInclusion.ORDER) { + systemClassPaths.addAll(userClassPaths); + } + + // normalize classpath by sorting + Collections.sort(systemClassPaths); + Collections.sort(userClassPaths); + + // classpath assembler + StringBuilder classPathBuilder = new StringBuilder(); + if (userJarInclusion == YarnConfigOptions.UserJarInclusion.FIRST) { + for (String userClassPath : userClassPaths) { + classPathBuilder.append(userClassPath).append(pathSeparator); + } + } + for (String classPath : systemClassPaths) { + classPathBuilder.append(classPath).append(pathSeparator); + } + + // Setup jar for ApplicationMaster + final YarnLocalResourceDescriptor localResourceDescFlinkJar = fileUploader.uploadFlinkDist(flinkJarPath); + classPathBuilder.append(localResourceDescFlinkJar.getResourceKey()).append(pathSeparator); + + // write job graph to tmp file and add it to local resource + // TODO: server use user main method to generate job graph + if (jobGraph != null) { + File tmpJobGraphFile = null; + try { + tmpJobGraphFile = File.createTempFile(appId.toString(), null); + try (FileOutputStream output = new FileOutputStream(tmpJobGraphFile); + ObjectOutputStream obOutput = new ObjectOutputStream(output)) { + obOutput.writeObject(jobGraph); + } + + final String jobGraphFilename = "job.graph"; + configuration.setString(JOB_GRAPH_FILE_PATH, jobGraphFilename); + + fileUploader.registerSingleLocalResource( + jobGraphFilename, new Path(tmpJobGraphFile.toURI()), "", LocalResourceType.FILE, true, false); + classPathBuilder.append(jobGraphFilename).append(pathSeparator); + } catch (Exception e) { + LOG.warn("Add job graph to local resource fail."); + throw e; + } finally { + if (tmpJobGraphFile != null && !tmpJobGraphFile.delete()) { + LOG.warn("Fail to delete temporary file {}.", tmpJobGraphFile.toPath()); + } + } + } + + // Upload the flink configuration + // write out configuration file + File tmpConfigurationFile = null; + try { + tmpConfigurationFile = File.createTempFile(appId + "-flink-conf.yaml", null); + + // remove localhost bind hosts as they render production clusters unusable + removeLocalhostBindHostSetting(configuration, JobManagerOptions.BIND_HOST); + removeLocalhostBindHostSetting(configuration, TaskManagerOptions.BIND_HOST); + // this setting is unconditionally overridden anyway, so we remove it for clarity + configuration.removeConfig(TaskManagerOptions.HOST); + + BootstrapTools.writeConfiguration(configuration, tmpConfigurationFile); + + String flinkConfigKey = "flink-conf.yaml"; + fileUploader.registerSingleLocalResource( + flinkConfigKey, + new Path(tmpConfigurationFile.getAbsolutePath()), + "", + LocalResourceType.FILE, + true, + true); + classPathBuilder.append("flink-conf.yaml").append(pathSeparator); + } finally { + if (tmpConfigurationFile != null && !tmpConfigurationFile.delete()) { + LOG.warn("Fail to delete temporary file {}.", tmpConfigurationFile.toPath()); + } + } + + if (userJarInclusion == YarnConfigOptions.UserJarInclusion.LAST) { + for (String userClassPath : userClassPaths) { + classPathBuilder.append(userClassPath).append(pathSeparator); + } + } + + // To support Yarn Secure Integration Test 
Scenario + // In Integration test setup, the Yarn containers created by YarnMiniCluster does not have + // the Yarn site XML + // and KRB5 configuration files. We are adding these files as container local resources for + // the container + // applications (JM/TMs) to have proper secure cluster setup + Path remoteYarnSiteXmlPath = null; + if (System.getenv("IN_TESTS") != null) { + File f = new File(System.getenv("YARN_CONF_DIR"), Utils.YARN_SITE_FILE_NAME); + LOG.info("Adding Yarn configuration {} to the AM container local resource bucket", f.getAbsolutePath()); + Path yarnSitePath = new Path(f.getAbsolutePath()); + remoteYarnSiteXmlPath = fileUploader + .registerSingleLocalResource( + Utils.YARN_SITE_FILE_NAME, yarnSitePath, "", LocalResourceType.FILE, false, false) + .getPath(); + if (System.getProperty("java.security.krb5.conf") != null) { + configuration.set(SecurityOptions.KERBEROS_KRB5_PATH, System.getProperty("java.security.krb5.conf")); + } + } + + Path remoteKrb5Path = null; + boolean hasKrb5 = false; + String krb5Config = configuration.get(SecurityOptions.KERBEROS_KRB5_PATH); + if (!StringUtils.isNullOrWhitespaceOnly(krb5Config)) { + final File krb5 = new File(krb5Config); + LOG.info("Adding KRB5 configuration {} to the AM container local resource bucket", krb5.getAbsolutePath()); + final Path krb5ConfPath = new Path(krb5.getAbsolutePath()); + remoteKrb5Path = fileUploader + .registerSingleLocalResource( + Utils.KRB5_FILE_NAME, krb5ConfPath, "", LocalResourceType.FILE, false, false) + .getPath(); + hasKrb5 = true; + } + + Path remotePathKeytab = null; + String localizedKeytabPath = null; + String keytab = configuration.getString(SecurityOptions.KERBEROS_LOGIN_KEYTAB); + if (keytab != null) { + boolean localizeKeytab = flinkConfiguration.getBoolean(YarnConfigOptions.SHIP_LOCAL_KEYTAB); + localizedKeytabPath = flinkConfiguration.getString(YarnConfigOptions.LOCALIZED_KEYTAB_PATH); + if (localizeKeytab) { + // Localize the keytab to YARN containers via local resource. + LOG.info("Adding keytab {} to the AM container local resource bucket", keytab); + remotePathKeytab = fileUploader + .registerSingleLocalResource( + localizedKeytabPath, new Path(keytab), "", LocalResourceType.FILE, false, false) + .getPath(); + } else { + // // Assume Keytab is pre-installed in the container. 
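+ // Only the configured keytab path is forwarded to the containers here; the keytab file itself is not shipped.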
+ localizedKeytabPath = flinkConfiguration.getString(YarnConfigOptions.LOCALIZED_KEYTAB_PATH); + } + } + + final JobManagerProcessSpec processSpec = + JobManagerProcessUtils.processSpecFromConfigWithNewOptionToInterpretLegacyHeap( + flinkConfiguration, JobManagerOptions.TOTAL_PROCESS_MEMORY); + final ContainerLaunchContext amContainer = + setupApplicationMasterContainer(yarnClusterEntrypoint, hasKrb5, processSpec); + + boolean fetchToken = configuration.getBoolean(SecurityOptions.DELEGATION_TOKENS_ENABLED); + KerberosLoginProvider kerberosLoginProvider = new KerberosLoginProvider(configuration); + if (kerberosLoginProvider.isLoginPossible(true)) { + setTokensFor(amContainer, fetchToken); + } else { + LOG.info("Cannot use kerberos delegation token manager, no valid kerberos credentials provided."); + } + + amContainer.setLocalResources(fileUploader.getRegisteredLocalResources()); + fileUploader.close(); + + Utils.setAclsFor(amContainer, flinkConfiguration); + + // Setup CLASSPATH and environment variables for ApplicationMaster + final Map appMasterEnv = generateApplicationMasterEnv( + fileUploader, classPathBuilder.toString(), localResourceDescFlinkJar.toString(), appId.toString()); + + if (localizedKeytabPath != null) { + appMasterEnv.put(YarnConfigKeys.LOCAL_KEYTAB_PATH, localizedKeytabPath); + String principal = configuration.getString(SecurityOptions.KERBEROS_LOGIN_PRINCIPAL); + appMasterEnv.put(YarnConfigKeys.KEYTAB_PRINCIPAL, principal); + if (remotePathKeytab != null) { + appMasterEnv.put(YarnConfigKeys.REMOTE_KEYTAB_PATH, remotePathKeytab.toString()); + } + } + + // To support Yarn Secure Integration Test Scenario + if (remoteYarnSiteXmlPath != null) { + appMasterEnv.put(YarnConfigKeys.ENV_YARN_SITE_XML_PATH, remoteYarnSiteXmlPath.toString()); + } + if (remoteKrb5Path != null) { + appMasterEnv.put(YarnConfigKeys.ENV_KRB5_PATH, remoteKrb5Path.toString()); + } + + amContainer.setEnvironment(appMasterEnv); + + // Set up resource type requirements for ApplicationMaster + Resource capability = Records.newRecord(Resource.class); + capability.setMemorySize(clusterSpecification.getMasterMemoryMB()); + capability.setVirtualCores(flinkConfiguration.getInteger(YarnConfigOptions.APP_MASTER_VCORES)); + + final String customApplicationName = customName != null ? customName : applicationName; + + appContext.setApplicationName(customApplicationName); + appContext.setApplicationType(applicationType != null ? 
applicationType : "Apache Flink"); + appContext.setAMContainerSpec(amContainer); + appContext.setResource(capability); + + // Set priority for application + int priorityNum = flinkConfiguration.getInteger(YarnConfigOptions.APPLICATION_PRIORITY); + if (priorityNum >= 0) { + Priority priority = Priority.newInstance(priorityNum); + appContext.setPriority(priority); + } + + if (yarnQueue != null) { + appContext.setQueue(yarnQueue); + } + + setApplicationNodeLabel(appContext); + + setApplicationTags(appContext); + + // add a hook to clean up in case deployment fails + Thread deploymentFailureHook = new DeploymentFailureHook(yarnApplication, fileUploader.getApplicationDir()); + Runtime.getRuntime().addShutdownHook(deploymentFailureHook); + LOG.info("Submitting application master " + appId); + yarnClient.submitApplication(appContext); + + LOG.info("Waiting for the cluster to be allocated"); + final long startTime = System.currentTimeMillis(); + long lastLogTime = System.currentTimeMillis(); + ApplicationReport report; + YarnApplicationState lastAppState = YarnApplicationState.NEW; + loop: + while (true) { + try { + report = yarnClient.getApplicationReport(appId); + } catch (IOException e) { + throw new YarnDeploymentException("Failed to deploy the cluster.", e); + } + YarnApplicationState appState = report.getYarnApplicationState(); + LOG.debug("Application State: {}", appState); + switch (appState) { + case FAILED: + case KILLED: + throw new YarnDeploymentException("The YARN application unexpectedly switched to state " + + appState + + " during deployment. \n" + + "Diagnostics from YARN: " + + report.getDiagnostics() + + "\n" + + "If log aggregation is enabled on your cluster, use this command to further investigate the issue:\n" + + "yarn logs -applicationId " + + appId); + // break .. + case RUNNING: + LOG.info("YARN application has been deployed successfully."); + break loop; + case FINISHED: + LOG.info("YARN application has been finished successfully."); + break loop; + default: + if (appState != lastAppState) { + LOG.info("Deploying cluster, current state " + appState); + } + if (System.currentTimeMillis() - lastLogTime > 60000) { + lastLogTime = System.currentTimeMillis(); + LOG.info( + "Deployment took more than {} seconds. 
Please check if the requested resources are available in the YARN cluster", + (lastLogTime - startTime) / 1000); + } + } + lastAppState = appState; + Thread.sleep(250); + } + + // since deployment was successful, remove the hook + ShutdownHookUtil.removeShutdownHook(deploymentFailureHook, getClass().getSimpleName(), LOG); + return report; + } + + private void removeLocalhostBindHostSetting(Configuration configuration, ConfigOption option) { + configuration + .getOptional(option) + .filter(bindHost -> bindHost.equals("localhost")) + .ifPresent(bindHost -> { + LOG.info( + "Removing 'localhost' {} setting from effective configuration; using '0.0.0.0' instead.", + option); + configuration.removeConfig(option); + }); + } + + private void setTokensFor(ContainerLaunchContext containerLaunchContext, boolean fetchToken) throws Exception { + Credentials credentials = new Credentials(); + + LOG.info("Loading delegation tokens available locally to add to the AM container"); + // for user + UserGroupInformation currUsr = UserGroupInformation.getCurrentUser(); + + Collection> usrTok = + currUsr.getCredentials().getAllTokens(); + for (Token token : usrTok) { + LOG.info("Adding user token " + token.getService() + " with " + token); + credentials.addToken(token.getService(), token); + } + + if (fetchToken) { + LOG.info("Fetching delegation tokens to add to the AM container."); + DelegationTokenManager delegationTokenManager = + new DefaultDelegationTokenManager(flinkConfiguration, null, null, null); + DelegationTokenContainer container = new DelegationTokenContainer(); + delegationTokenManager.obtainDelegationTokens(container); + + // This is here for backward compatibility to make log aggregation work + for (Map.Entry e : container.getTokens().entrySet()) { + if (e.getKey().equals("hadoopfs")) { + credentials.addAll(HadoopDelegationTokenConverter.deserialize(e.getValue())); + } + } + } + + ByteBuffer tokens = ByteBuffer.wrap(HadoopDelegationTokenConverter.serialize(credentials)); + containerLaunchContext.setTokens(tokens); + + LOG.info("Delegation tokens added to the AM container."); + } + + /** + * Returns the configured remote target home directory if set, otherwise returns the default + * home directory. + * + * @param defaultFileSystem default file system used + * @return the remote target home directory + */ + @VisibleForTesting + Path getStagingDir(FileSystem defaultFileSystem) throws IOException { + final String configuredStagingDir = flinkConfiguration.getString(YarnConfigOptions.STAGING_DIRECTORY); + if (configuredStagingDir == null) { + return defaultFileSystem.getHomeDirectory(); + } + FileSystem stagingDirFs = new Path(configuredStagingDir).getFileSystem(defaultFileSystem.getConf()); + return stagingDirFs.makeQualified(new Path(configuredStagingDir)); + } + + private int getFileReplication() { + final int yarnFileReplication = + yarnConfiguration.getInt(DFSConfigKeys.DFS_REPLICATION_KEY, DFSConfigKeys.DFS_REPLICATION_DEFAULT); + final int fileReplication = flinkConfiguration.getInteger(YarnConfigOptions.FILE_REPLICATION); + return fileReplication > 0 ? fileReplication : yarnFileReplication; + } + + private static String encodeYarnLocalResourceDescriptorListToString(List resources) { + return String.join( + LOCAL_RESOURCE_DESCRIPTOR_SEPARATOR, + resources.stream().map(YarnLocalResourceDescriptor::toString).collect(Collectors.toList())); + } + + /** + * Kills YARN application and stops YARN client. + * + *
Use this method to kill the App before it has been properly deployed + */ + private void failSessionDuringDeployment(YarnClient yarnClient, YarnClientApplication yarnApplication) { + LOG.info("Killing YARN application"); + + try { + yarnClient.killApplication( + yarnApplication.getNewApplicationResponse().getApplicationId()); + } catch (Exception e) { + // we only log a debug message here because the "killApplication" call is a best-effort + // call (we don't know if the application has been deployed when the error occurred). + LOG.debug("Error while killing YARN application", e); + } + } + + private static class ClusterResourceDescription { + public final long totalFreeMemory; + public final long containerLimit; + public final long[] nodeManagersFree; + + public ClusterResourceDescription(long totalFreeMemory, long containerLimit, long[] nodeManagersFree) { + this.totalFreeMemory = totalFreeMemory; + this.containerLimit = containerLimit; + this.nodeManagersFree = nodeManagersFree; + } + } + + private ClusterResourceDescription getCurrentFreeClusterResources(YarnClient yarnClient) + throws YarnException, IOException { + List nodes = yarnClient.getNodeReports(NodeState.RUNNING); + + int totalFreeMemory = 0; + long containerLimit = 0; + long[] nodeManagersFree = new long[nodes.size()]; + + for (int i = 0; i < nodes.size(); i++) { + NodeReport rep = nodes.get(i); + long free = rep.getCapability().getMemorySize() + - (rep.getUsed() != null ? rep.getUsed().getMemorySize() : 0); + nodeManagersFree[i] = free; + totalFreeMemory += free; + if (free > containerLimit) { + containerLimit = free; + } + } + return new ClusterResourceDescription(totalFreeMemory, containerLimit, nodeManagersFree); + } + + @Override + public String getClusterDescription() { + + try { + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + PrintStream ps = new PrintStream(baos); + + YarnClusterMetrics metrics = yarnClient.getYarnClusterMetrics(); + + ps.append("NodeManagers in the ClusterClient " + metrics.getNumNodeManagers()); + List nodes = yarnClient.getNodeReports(NodeState.RUNNING); + final String format = "|%-16s |%-16s %n"; + ps.printf("|Property |Value %n"); + ps.println("+---------------------------------------+"); + long totalMemory = 0; + int totalCores = 0; + for (NodeReport rep : nodes) { + final Resource res = rep.getCapability(); + totalMemory += res.getMemorySize(); + totalCores += res.getVirtualCores(); + ps.format(format, "NodeID", rep.getNodeId()); + ps.format(format, "Memory", getDisplayMemory(res.getMemorySize())); + ps.format(format, "vCores", res.getVirtualCores()); + ps.format(format, "HealthReport", rep.getHealthReport()); + ps.format(format, "Containers", rep.getNumContainers()); + ps.println("+---------------------------------------+"); + } + ps.println("Summary: totalMemory " + getDisplayMemory(totalMemory) + " totalCores " + totalCores); + List qInfo = yarnClient.getAllQueues(); + for (QueueInfo q : qInfo) { + ps.println("Queue: " + + q.getQueueName() + + ", Current Capacity: " + + q.getCurrentCapacity() + + " Max Capacity: " + + q.getMaximumCapacity() + + " Applications: " + + q.getApplications().size()); + } + return baos.toString(); + } catch (Exception e) { + throw new RuntimeException("Couldn't get cluster description", e); + } + } + + private void activateHighAvailabilitySupport(ApplicationSubmissionContext appContext) + throws InvocationTargetException, IllegalAccessException { + + ApplicationSubmissionContextReflector reflector = 
ApplicationSubmissionContextReflector.getInstance(); + + reflector.setKeepContainersAcrossApplicationAttempts(appContext, true); + + reflector.setAttemptFailuresValidityInterval( + appContext, + flinkConfiguration.getLong(YarnConfigOptions.APPLICATION_ATTEMPT_FAILURE_VALIDITY_INTERVAL)); + } + + private void setApplicationTags(final ApplicationSubmissionContext appContext) + throws InvocationTargetException, IllegalAccessException { + + final ApplicationSubmissionContextReflector reflector = ApplicationSubmissionContextReflector.getInstance(); + final String tagsString = flinkConfiguration.getString(YarnConfigOptions.APPLICATION_TAGS); + + final Set applicationTags = new HashSet<>(); + + // Trim whitespace and cull empty tags + for (final String tag : tagsString.split(",")) { + final String trimmedTag = tag.trim(); + if (!trimmedTag.isEmpty()) { + applicationTags.add(trimmedTag); + } + } + + reflector.setApplicationTags(appContext, applicationTags); + } + + private void setApplicationNodeLabel(final ApplicationSubmissionContext appContext) + throws InvocationTargetException, IllegalAccessException { + + if (nodeLabel != null) { + final ApplicationSubmissionContextReflector reflector = ApplicationSubmissionContextReflector.getInstance(); + reflector.setApplicationNodeLabel(appContext, nodeLabel); + } + } + + /** + * Singleton object which uses reflection to determine whether the {@link + * ApplicationSubmissionContext} supports various methods which, depending on the Hadoop + * version, may or may not be supported. + * + *
If an unsupported method is invoked, nothing happens. + * + *
Currently three methods are proxied: - setApplicationTags (>= 2.4.0) - + * setAttemptFailuresValidityInterval (>= 2.6.0) - setKeepContainersAcrossApplicationAttempts + * (>= 2.4.0) - setNodeLabelExpression (>= 2.6.0) + */ + private static class ApplicationSubmissionContextReflector { + private static final Logger LOG = LoggerFactory.getLogger(ApplicationSubmissionContextReflector.class); + + private static final ApplicationSubmissionContextReflector instance = + new ApplicationSubmissionContextReflector(ApplicationSubmissionContext.class); + + public static ApplicationSubmissionContextReflector getInstance() { + return instance; + } + + private static final String APPLICATION_TAGS_METHOD_NAME = "setApplicationTags"; + private static final String ATTEMPT_FAILURES_METHOD_NAME = "setAttemptFailuresValidityInterval"; + private static final String KEEP_CONTAINERS_METHOD_NAME = "setKeepContainersAcrossApplicationAttempts"; + private static final String NODE_LABEL_EXPRESSION_NAME = "setNodeLabelExpression"; + + private final Method applicationTagsMethod; + private final Method attemptFailuresValidityIntervalMethod; + private final Method keepContainersMethod; + + @Nullable + private final Method nodeLabelExpressionMethod; + + private ApplicationSubmissionContextReflector(Class clazz) { + Method applicationTagsMethod; + Method attemptFailuresValidityIntervalMethod; + Method keepContainersMethod; + Method nodeLabelExpressionMethod; + + try { + // this method is only supported by Hadoop 2.4.0 onwards + applicationTagsMethod = clazz.getMethod(APPLICATION_TAGS_METHOD_NAME, Set.class); + LOG.debug("{} supports method {}.", clazz.getCanonicalName(), APPLICATION_TAGS_METHOD_NAME); + } catch (NoSuchMethodException e) { + LOG.debug("{} does not support method {}.", clazz.getCanonicalName(), APPLICATION_TAGS_METHOD_NAME); + // assign null because the Hadoop version apparently does not support this call. + applicationTagsMethod = null; + } + + this.applicationTagsMethod = applicationTagsMethod; + + try { + // this method is only supported by Hadoop 2.6.0 onwards + attemptFailuresValidityIntervalMethod = clazz.getMethod(ATTEMPT_FAILURES_METHOD_NAME, long.class); + LOG.debug("{} supports method {}.", clazz.getCanonicalName(), ATTEMPT_FAILURES_METHOD_NAME); + } catch (NoSuchMethodException e) { + LOG.debug("{} does not support method {}.", clazz.getCanonicalName(), ATTEMPT_FAILURES_METHOD_NAME); + // assign null because the Hadoop version apparently does not support this call. + attemptFailuresValidityIntervalMethod = null; + } + + this.attemptFailuresValidityIntervalMethod = attemptFailuresValidityIntervalMethod; + + try { + // this method is only supported by Hadoop 2.4.0 onwards + keepContainersMethod = clazz.getMethod(KEEP_CONTAINERS_METHOD_NAME, boolean.class); + LOG.debug("{} supports method {}.", clazz.getCanonicalName(), KEEP_CONTAINERS_METHOD_NAME); + } catch (NoSuchMethodException e) { + LOG.debug("{} does not support method {}.", clazz.getCanonicalName(), KEEP_CONTAINERS_METHOD_NAME); + // assign null because the Hadoop version apparently does not support this call. 
+ keepContainersMethod = null; + } + + this.keepContainersMethod = keepContainersMethod; + + try { + nodeLabelExpressionMethod = clazz.getMethod(NODE_LABEL_EXPRESSION_NAME, String.class); + LOG.debug("{} supports method {}.", clazz.getCanonicalName(), NODE_LABEL_EXPRESSION_NAME); + } catch (NoSuchMethodException e) { + LOG.debug("{} does not support method {}.", clazz.getCanonicalName(), NODE_LABEL_EXPRESSION_NAME); + nodeLabelExpressionMethod = null; + } + + this.nodeLabelExpressionMethod = nodeLabelExpressionMethod; + } + + public void setApplicationTags(ApplicationSubmissionContext appContext, Set applicationTags) + throws InvocationTargetException, IllegalAccessException { + if (applicationTagsMethod != null) { + LOG.debug( + "Calling method {} of {}.", + applicationTagsMethod.getName(), + appContext.getClass().getCanonicalName()); + applicationTagsMethod.invoke(appContext, applicationTags); + } else { + LOG.debug( + "{} does not support method {}. Doing nothing.", + appContext.getClass().getCanonicalName(), + APPLICATION_TAGS_METHOD_NAME); + } + } + + public void setApplicationNodeLabel(ApplicationSubmissionContext appContext, String nodeLabel) + throws InvocationTargetException, IllegalAccessException { + if (nodeLabelExpressionMethod != null) { + LOG.debug( + "Calling method {} of {}.", + nodeLabelExpressionMethod.getName(), + appContext.getClass().getCanonicalName()); + nodeLabelExpressionMethod.invoke(appContext, nodeLabel); + } else { + LOG.debug( + "{} does not support method {}. Doing nothing.", + appContext.getClass().getCanonicalName(), + NODE_LABEL_EXPRESSION_NAME); + } + } + + public void setAttemptFailuresValidityInterval(ApplicationSubmissionContext appContext, long validityInterval) + throws InvocationTargetException, IllegalAccessException { + if (attemptFailuresValidityIntervalMethod != null) { + LOG.debug( + "Calling method {} of {}.", + attemptFailuresValidityIntervalMethod.getName(), + appContext.getClass().getCanonicalName()); + attemptFailuresValidityIntervalMethod.invoke(appContext, validityInterval); + } else { + LOG.debug( + "{} does not support method {}. Doing nothing.", + appContext.getClass().getCanonicalName(), + ATTEMPT_FAILURES_METHOD_NAME); + } + } + + public void setKeepContainersAcrossApplicationAttempts( + ApplicationSubmissionContext appContext, boolean keepContainers) + throws InvocationTargetException, IllegalAccessException { + + if (keepContainersMethod != null) { + LOG.debug( + "Calling method {} of {}.", + keepContainersMethod.getName(), + appContext.getClass().getCanonicalName()); + keepContainersMethod.invoke(appContext, keepContainers); + } else { + LOG.debug( + "{} does not support method {}. 
Doing nothing.", + appContext.getClass().getCanonicalName(), + KEEP_CONTAINERS_METHOD_NAME); + } + } + } + + private static class YarnDeploymentException extends RuntimeException { + private static final long serialVersionUID = -812040641215388943L; + + public YarnDeploymentException(String message) { + super(message); + } + + public YarnDeploymentException(String message, Throwable cause) { + super(message, cause); + } + } + + private class DeploymentFailureHook extends Thread { + + private final YarnClient yarnClient; + private final YarnClientApplication yarnApplication; + private final Path yarnFilesDir; + + DeploymentFailureHook(YarnClientApplication yarnApplication, Path yarnFilesDir) { + this.yarnApplication = Preconditions.checkNotNull(yarnApplication); + this.yarnFilesDir = Preconditions.checkNotNull(yarnFilesDir); + + // A new yarn client need to be created in shutdown hook in order to avoid + // the yarn client has been closed by YarnClusterDescriptor. + this.yarnClient = YarnClient.createYarnClient(); + this.yarnClient.init(yarnConfiguration); + } + + @Override + public void run() { + LOG.info("Cancelling deployment from Deployment Failure Hook"); + yarnClient.start(); + failSessionDuringDeployment(yarnClient, yarnApplication); + yarnClient.stop(); + LOG.info("Deleting files in {}.", yarnFilesDir); + try { + FileSystem fs = FileSystem.get(yarnConfiguration); + + if (!fs.delete(yarnFilesDir, true)) { + throw new IOException("Deleting files in " + yarnFilesDir + " was unsuccessful"); + } + + fs.close(); + } catch (IOException e) { + LOG.error("Failed to delete Flink Jar and configuration files in HDFS", e); + } + } + } + + @VisibleForTesting + void addLibFoldersToShipFiles(Collection effectiveShipFiles) { + // Add lib folder to the ship files if the environment variable is set. + // This is for convenience when running from the command-line. + // (for other files users explicitly set the ship files) + String libDir = System.getenv().get(ENV_FLINK_LIB_DIR); + if (libDir != null) { + File directoryFile = new File(libDir); + if (directoryFile.isDirectory()) { + effectiveShipFiles.add(directoryFile); + } else { + throw new YarnDeploymentException("The environment variable '" + + ENV_FLINK_LIB_DIR + + "' is set to '" + + libDir + + "' but the directory doesn't exist."); + } + } else if (shipFiles.isEmpty()) { + LOG.warn( + "Environment variable '{}' not set and ship files have not been provided manually. " + + "Not shipping any library files.", + ENV_FLINK_LIB_DIR); + } + } + + @VisibleForTesting + void addUsrLibFolderToShipFiles(Collection effectiveShipFiles) { + // Add usrlib folder to the ship files if it exists + // Classes in the folder will be loaded by UserClassLoader if CLASSPATH_INCLUDE_USER_JAR is + // DISABLED. 
+ ClusterEntrypointUtils.tryFindUserLibDirectory().ifPresent(usrLibDirFile -> { + effectiveShipFiles.add(usrLibDirFile); + LOG.info("usrlib: {} will be shipped automatically.", usrLibDirFile.getAbsolutePath()); + }); + } + + @VisibleForTesting + void addPluginsFoldersToShipFiles(Collection effectiveShipFiles) { + final Optional pluginsDir = PluginConfig.getPluginsDir(); + pluginsDir.ifPresent(effectiveShipFiles::add); + } + + ContainerLaunchContext setupApplicationMasterContainer( + String yarnClusterEntrypoint, boolean hasKrb5, JobManagerProcessSpec processSpec) { + // ------------------ Prepare Application Master Container ------------------------------ + + // respect custom JVM options in the YAML file + String javaOpts = flinkConfiguration.getString(CoreOptions.FLINK_JVM_OPTIONS); + if (flinkConfiguration.getString(CoreOptions.FLINK_JM_JVM_OPTIONS).length() > 0) { + javaOpts += " " + flinkConfiguration.getString(CoreOptions.FLINK_JM_JVM_OPTIONS); + } + + javaOpts += " " + IGNORE_UNRECOGNIZED_VM_OPTIONS; + + // krb5.conf file will be available as local resource in JM/TM container + if (hasKrb5) { + javaOpts += " -Djava.security.krb5.conf=krb5.conf"; + } + + // Set up the container launch context for the application master + ContainerLaunchContext amContainer = Records.newRecord(ContainerLaunchContext.class); + + final Map startCommandValues = new HashMap<>(); + startCommandValues.put("java", "$JAVA_HOME/bin/java"); + + String jvmHeapMem = JobManagerProcessUtils.generateJvmParametersStr(processSpec, flinkConfiguration); + startCommandValues.put("jvmmem", jvmHeapMem); + + startCommandValues.put("jvmopts", javaOpts); + startCommandValues.put("logging", YarnLogConfigUtil.getLoggingYarnCommand(flinkConfiguration)); + + startCommandValues.put("class", yarnClusterEntrypoint); + startCommandValues.put( + "redirects", + "1> " + + ApplicationConstants.LOG_DIR_EXPANSION_VAR + + "/jobmanager.out " + + "2> " + + ApplicationConstants.LOG_DIR_EXPANSION_VAR + + "/jobmanager.err"); + String dynamicParameterListStr = JobManagerProcessUtils.generateDynamicConfigsStr(processSpec); + startCommandValues.put("args", dynamicParameterListStr); + + final String commandTemplate = flinkConfiguration.getString( + ConfigConstants.YARN_CONTAINER_START_COMMAND_TEMPLATE, + ConfigConstants.DEFAULT_YARN_CONTAINER_START_COMMAND_TEMPLATE); + final String amCommand = BootstrapTools.getStartCommand(commandTemplate, startCommandValues); + + amContainer.setCommands(Collections.singletonList(amCommand)); + + LOG.debug("Application Master start command: " + amCommand); + + return amContainer; + } + + private static YarnConfigOptions.UserJarInclusion getUserJarInclusionMode( + org.apache.flink.configuration.Configuration config) { + return config.get(YarnConfigOptions.CLASSPATH_INCLUDE_USER_JAR); + } + + private static boolean isUsrLibDirIncludedInShipFiles(List shipFiles) { + return shipFiles.stream() + .filter(File::isDirectory) + .map(File::getName) + .anyMatch(name -> name.equals(DEFAULT_FLINK_USR_LIB_DIR)); + } + + private void setClusterEntrypointInfoToConfig(final ApplicationReport report) { + checkNotNull(report); + + final ApplicationId appId = report.getApplicationId(); + final String host = report.getHost(); + final int port = report.getRpcPort(); + + LOG.info("Found Web Interface {}:{} of application '{}'.", host, port, appId); + + flinkConfiguration.setString(JobManagerOptions.ADDRESS, host); + flinkConfiguration.setInteger(JobManagerOptions.PORT, port); + + flinkConfiguration.setString(RestOptions.ADDRESS, host); + 
flinkConfiguration.setInteger(RestOptions.PORT, port); + + flinkConfiguration.set(YarnConfigOptions.APPLICATION_ID, ConverterUtils.toString(appId)); + + setHAClusterIdIfNotSet(flinkConfiguration, appId); + } + + private void setHAClusterIdIfNotSet(Configuration configuration, ApplicationId appId) { + // set cluster-id to app id if not specified + if (!configuration.contains(HighAvailabilityOptions.HA_CLUSTER_ID)) { + configuration.set(HighAvailabilityOptions.HA_CLUSTER_ID, ConverterUtils.toString(appId)); + } + } + + public static void logDetachedClusterInformation(ApplicationId yarnApplicationId, Logger logger) { + logger.info( + "The Flink YARN session cluster has been started in detached mode. In order to " + + "stop Flink gracefully, use the following command:\n" + + "$ echo \"stop\" | ./bin/yarn-session.sh -id {}\n" + + "If this should not be possible, then you can also kill Flink via YARN's web interface or via:\n" + + "$ yarn application -kill {}\n" + + "Note that killing Flink might not clean up all job artifacts and temporary files.", + yarnApplicationId, + yarnApplicationId); + } + + @VisibleForTesting + Map generateApplicationMasterEnv( + final YarnApplicationFileUploader fileUploader, + final String classPathStr, + final String localFlinkJarStr, + final String appIdStr) + throws IOException { + final Map env = new HashMap<>(); + // set user specified app master environment variables + env.putAll(ConfigurationUtils.getPrefixedKeyValuePairs( + ResourceManagerOptions.CONTAINERIZED_MASTER_ENV_PREFIX, this.flinkConfiguration)); + // set Flink app class path + env.put(ENV_FLINK_CLASSPATH, classPathStr); + // Set FLINK_LIB_DIR to `lib` folder under working dir in container + env.put(ENV_FLINK_LIB_DIR, Path.CUR_DIR + "/" + ConfigConstants.DEFAULT_FLINK_LIB_DIR); + // Set FLINK_OPT_DIR to `opt` folder under working dir in container + env.put(ENV_FLINK_OPT_DIR, Path.CUR_DIR + "/" + ConfigConstants.DEFAULT_FLINK_OPT_DIR); + // set Flink on YARN internal configuration values + env.put(YarnConfigKeys.FLINK_DIST_JAR, localFlinkJarStr); + env.put(YarnConfigKeys.ENV_APP_ID, appIdStr); + env.put(YarnConfigKeys.ENV_CLIENT_HOME_DIR, fileUploader.getHomeDir().toString()); + env.put( + YarnConfigKeys.ENV_CLIENT_SHIP_FILES, + encodeYarnLocalResourceDescriptorListToString(fileUploader.getEnvShipResourceList())); + env.put( + YarnConfigKeys.FLINK_YARN_FILES, + fileUploader.getApplicationDir().toUri().toString()); + // https://github.com/apache/hadoop/blob/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/YarnApplicationSecurity.md#identity-on-an-insecure-cluster-hadoop_user_name + env.put( + YarnConfigKeys.ENV_HADOOP_USER_NAME, + UserGroupInformation.getCurrentUser().getUserName()); + // set classpath from YARN configuration + Utils.setupYarnClassPath(this.yarnConfiguration, env); + return env; + } + + private String getDisplayMemory(long memoryMB) { + return MemorySize.ofMebiBytes(memoryMB).toHumanReadableString(); + } +} diff --git a/dinky-client/dinky-client-1.18/src/main/java/org/dinky/executor/CustomTableEnvironmentImpl.java b/dinky-client/dinky-client-1.18/src/main/java/org/dinky/executor/CustomTableEnvironmentImpl.java index ff9506611d..eaf8adb816 100644 --- a/dinky-client/dinky-client-1.18/src/main/java/org/dinky/executor/CustomTableEnvironmentImpl.java +++ b/dinky-client/dinky-client-1.18/src/main/java/org/dinky/executor/CustomTableEnvironmentImpl.java @@ -53,9 +53,7 @@ import org.apache.flink.table.operations.ddl.CreateTableOperation; import org.apache.flink.types.Row; 
-import java.io.File;
 import java.util.ArrayList;
-import java.util.Arrays;
 import java.util.Collections;
 import java.util.List;
 import java.util.Map;
@@ -107,19 +105,6 @@ public static CustomTableEnvironmentImpl create(
         return new CustomTableEnvironmentImpl(streamTableEnvironment);
     }
 
-    @Override
-    public void addJar(File... jarPath) {
-        Configuration configuration = this.getRootConfiguration();
-        List<String> jars = configuration.get(PipelineOptions.JARS);
-        if (jars == null) {
-            configuration.set(
-                    PipelineOptions.JARS,
-                    Arrays.stream(jarPath).map(File::getAbsolutePath).collect(Collectors.toList()));
-        } else {
-            CollUtil.addAll(jars, jarPath);
-        }
-    }
-
     @Override
     public boolean parseAndLoadConfiguration(String statement, Map<String, Object> setMap) {
         List<Operation> operations = getParser().parse(statement);
diff --git a/dinky-client/dinky-client-base/src/main/java/org/dinky/executor/CustomTableEnvironment.java b/dinky-client/dinky-client-base/src/main/java/org/dinky/executor/CustomTableEnvironment.java
index f2b1e67bdb..7f549241eb 100644
--- a/dinky-client/dinky-client-base/src/main/java/org/dinky/executor/CustomTableEnvironment.java
+++ b/dinky-client/dinky-client-base/src/main/java/org/dinky/executor/CustomTableEnvironment.java
@@ -23,6 +23,7 @@ import org.dinky.data.result.SqlExplainResult;
 
 import org.apache.flink.configuration.Configuration;
+import org.apache.flink.configuration.PipelineOptions;
 import org.apache.flink.runtime.jobgraph.JobGraph;
 import org.apache.flink.runtime.rest.messages.JobPlanInfo;
 import org.apache.flink.streaming.api.datastream.DataStream;
@@ -36,12 +37,18 @@ import org.apache.flink.types.Row;
 
 import java.io.File;
+import java.net.URL;
+import java.util.Arrays;
 import java.util.Collections;
 import java.util.List;
 import java.util.Map;
+import java.util.stream.Collectors;
 
 import com.fasterxml.jackson.databind.node.ObjectNode;
 
+import cn.hutool.core.collection.CollUtil;
+import cn.hutool.core.util.URLUtil;
+
 /**
  * CustomTableEnvironment
  *
@@ -76,5 +83,15 @@ default List getLineage(String statement) {
 
     void executeCTAS(Operation operation);
 
-    void addJar(File... jarPath);
+    default void addJar(File... jarPath) {
+        Configuration configuration = this.getRootConfiguration();
+        List<String> pathList =
+                Arrays.stream(URLUtil.getURLs(jarPath)).map(URL::toString).collect(Collectors.toList());
+        List<String> jars = configuration.get(PipelineOptions.JARS);
+        if (jars == null) {
+            configuration.set(PipelineOptions.JARS, pathList);
+        } else {
+            CollUtil.addAll(jars, pathList);
+        }
+    }
 }
diff --git a/dinky-client/dinky-client-base/src/main/java/org/dinky/trans/dml/ExecuteJarOperation.java b/dinky-client/dinky-client-base/src/main/java/org/dinky/trans/dml/ExecuteJarOperation.java
index 30c92b861b..7847791166 100644
--- a/dinky-client/dinky-client-base/src/main/java/org/dinky/trans/dml/ExecuteJarOperation.java
+++ b/dinky-client/dinky-client-base/src/main/java/org/dinky/trans/dml/ExecuteJarOperation.java
@@ -24,6 +24,7 @@ import org.dinky.trans.ExtendOperation;
 import org.dinky.trans.parse.ExecuteJarParseStrategy;
 import org.dinky.utils.RunTimeUtil;
+import org.dinky.utils.URLUtils;
 
 import org.apache.flink.api.dag.Pipeline;
 import org.apache.flink.client.program.PackagedProgram;
@@ -37,7 +38,6 @@ import java.io.File;
 import java.util.Optional;
 
-import cn.hutool.core.io.FileUtil;
 import cn.hutool.core.lang.Assert;
 import cn.hutool.core.util.StrUtil;
 import lombok.Getter;
@@ -62,6 +62,10 @@ public Optional execute(CustomTableEnvironment tEnv) {
 
     protected StreamGraph getStreamGraph(CustomTableEnvironment tEnv) {
         JarSubmitParam submitParam = JarSubmitParam.build(statement);
+        return getStreamGraph(submitParam, tEnv);
+    }
+
+    public static StreamGraph getStreamGraph(JarSubmitParam submitParam, CustomTableEnvironment tEnv) {
         SavepointRestoreSettings savepointRestoreSettings = StrUtil.isBlank(submitParam.getSavepointPath())
                 ? SavepointRestoreSettings.none()
                 : SavepointRestoreSettings.forPath(
@@ -69,7 +73,7 @@ protected StreamGraph getStreamGraph(CustomTableEnvironment tEnv) {
         PackagedProgram program;
         try {
             Configuration configuration = tEnv.getConfig().getConfiguration();
-            File file = FileUtil.file(submitParam.getUri());
+            File file = URLUtils.toFile(submitParam.getUri());
             program = PackagedProgram.newBuilder()
                     .setJarFile(file)
                     .setEntryPointClassName(submitParam.getMainClass())
diff --git a/dinky-common/pom.xml b/dinky-common/pom.xml
index 79d9414507..e4815e50a8 100644
--- a/dinky-common/pom.xml
+++ b/dinky-common/pom.xml
@@ -31,6 +31,11 @@
     <name>Dinky : Common</name>
 
+        <dependency>
+            <groupId>com.amazonaws</groupId>
+            <artifactId>aws-java-sdk-s3</artifactId>
+            <version>1.12.589</version>
+        </dependency>
         <dependency>
             <groupId>com.github.xiaoymin</groupId>
             <artifactId>knife4j-openapi2-spring-boot-starter</artifactId>
diff --git a/dinky-common/src/main/java/org/dinky/data/app/AppTask.java b/dinky-common/src/main/java/org/dinky/data/app/AppTask.java
index 8cfaef7220..4852fee9fc 100644
--- a/dinky-common/src/main/java/org/dinky/data/app/AppTask.java
+++ b/dinky-common/src/main/java/org/dinky/data/app/AppTask.java
@@ -19,6 +19,8 @@
 
 package org.dinky.data.app;
 
+import org.dinky.config.Dialect;
+
 import io.swagger.annotations.ApiModelProperty;
 import lombok.Data;
 
@@ -33,6 +35,9 @@ public class AppTask {
     @ApiModelProperty(value = "Type", dataType = "String", notes = "Type of the task")
     private String type;
 
+    @ApiModelProperty(value = "Dialect", dataType = "Dialect", notes = "Dialect")
+    private Dialect dialect;
+
     @ApiModelProperty(value = "Check Point", dataType = "Integer", example = "1", notes = "Check point for the task")
     private Integer checkPoint;
diff --git a/dinky-admin/src/main/java/org/dinky/data/properties/OssProperties.java b/dinky-common/src/main/java/org/dinky/data/properties/OssProperties.java
similarity index 100%
rename from
dinky-admin/src/main/java/org/dinky/data/properties/OssProperties.java rename to dinky-common/src/main/java/org/dinky/data/properties/OssProperties.java diff --git a/dinky-admin/src/main/java/org/dinky/utils/OssTemplate.java b/dinky-common/src/main/java/org/dinky/oss/OssTemplate.java similarity index 99% rename from dinky-admin/src/main/java/org/dinky/utils/OssTemplate.java rename to dinky-common/src/main/java/org/dinky/oss/OssTemplate.java index a56b2e1c5d..d4b52fcdac 100644 --- a/dinky-admin/src/main/java/org/dinky/utils/OssTemplate.java +++ b/dinky-common/src/main/java/org/dinky/oss/OssTemplate.java @@ -17,7 +17,7 @@ * */ -package org.dinky.utils; +package org.dinky.oss; import org.dinky.data.properties.OssProperties; diff --git a/dinky-core/src/main/java/org/dinky/job/JobManager.java b/dinky-core/src/main/java/org/dinky/job/JobManager.java index 19995fff7f..2aa275b321 100644 --- a/dinky-core/src/main/java/org/dinky/job/JobManager.java +++ b/dinky-core/src/main/java/org/dinky/job/JobManager.java @@ -323,15 +323,22 @@ public StreamGraph getJarStreamGraph(String statement) throws Exception { String[] statements = SqlUtil.getStatements(statement, sqlSeparator); ExecuteJarOperation executeJarOperation = null; for (int i = 0; i < statements.length; i++) { - String sqlStatement = executor.pretreatStatement(statements[i]); + String sql = statements[i]; + String sqlStatement = executor.pretreatStatement(sql); if (ExecuteJarParseStrategy.INSTANCE.match(sqlStatement)) { currentSql = sqlStatement; - executeJarOperation = new ExecuteJarOperation(statement); + executeJarOperation = new ExecuteJarOperation(sqlStatement); break; } - SqlType operationType = Operations.getOperationType(statement); + SqlType operationType = Operations.getOperationType(sqlStatement); if (operationType.equals(SqlType.ADD)) { - AddJarSqlParseStrategy.getAllFilePath(statement).forEach(executor::addJar); + AddJarSqlParseStrategy.getAllFilePath(sqlStatement).forEach(executor::addJar); + if (runMode.isApplicationMode()) { + AddJarSqlParseStrategy.getAllFilePath(sqlStatement) + .forEach(FlinkUdfPathContextHolder::addOtherPlugins); + } else { + AddJarSqlParseStrategy.getAllFilePath(sqlStatement).forEach(executor::addJar); + } } } Assert.notNull(executeJarOperation, () -> new DinkyException("Not found execute jar operation.")); @@ -366,13 +373,14 @@ public JobResult executeJarSql(String statement) throws Exception { if (gatewayResult.isSuccess()) { job.setStatus(Job.JobStatus.SUCCESS); + success(); } else { job.setStatus(Job.JobStatus.FAILED); job.setError(gatewayResult.getError()); + log.error(gatewayResult.getError()); + failed(); } } - job.setStatus(Job.JobStatus.SUCCESS); - success(); } catch (Exception e) { String error = LogUtil.getError("Exception in executing FlinkJarSQL:\n" + addLineNumber(statement), e); job.setEndTime(LocalDateTime.now()); diff --git a/dinky-flink/dinky-flink-1.14/pom.xml b/dinky-flink/dinky-flink-1.14/pom.xml index 6944fa78a2..d8114ec1da 100644 --- a/dinky-flink/dinky-flink-1.14/pom.xml +++ b/dinky-flink/dinky-flink-1.14/pom.xml @@ -85,28 +85,6 @@ org.apache.flink flink-yarn_${scala.binary.version} ${flink.version} - - - org.apache.hadoop - hadoop-yarn-common - - - org.apache.hadoop - hadoop-common - - - org.apache.hadoop - hadoop-hdfs - - - org.apache.hadoop - hadoop-yarn-client - - - org.apache.hadoop - hadoop-mapreduce-client-core - - org.apache.flink diff --git a/dinky-flink/dinky-flink-1.15/pom.xml b/dinky-flink/dinky-flink-1.15/pom.xml index f58253d0c2..70ac61a76b 100644 --- 
a/dinky-flink/dinky-flink-1.15/pom.xml +++ b/dinky-flink/dinky-flink-1.15/pom.xml @@ -102,28 +102,6 @@ org.apache.flink flink-yarn ${flink.version} - - - org.apache.hadoop - hadoop-yarn-common - - - org.apache.hadoop - hadoop-common - - - org.apache.hadoop - hadoop-hdfs - - - org.apache.hadoop - hadoop-yarn-client - - - org.apache.hadoop - hadoop-mapreduce-client-core - - org.apache.flink diff --git a/dinky-flink/dinky-flink-1.16/pom.xml b/dinky-flink/dinky-flink-1.16/pom.xml index 9b0bc5b3b9..b578513e71 100644 --- a/dinky-flink/dinky-flink-1.16/pom.xml +++ b/dinky-flink/dinky-flink-1.16/pom.xml @@ -17,7 +17,7 @@ 1.3.1 16.0 - 1.16.0 + 1.16.2 2.3.0 @@ -69,26 +69,6 @@ org.slf4j slf4j-api - - org.apache.hadoop - hadoop-yarn-common - - - org.apache.hadoop - hadoop-common - - - org.apache.hadoop - hadoop-hdfs - - - org.apache.hadoop - hadoop-yarn-client - - - org.apache.hadoop - hadoop-mapreduce-client-core - diff --git a/dinky-flink/dinky-flink-1.17/pom.xml b/dinky-flink/dinky-flink-1.17/pom.xml index 88bab7d2ac..8b4b7e3633 100644 --- a/dinky-flink/dinky-flink-1.17/pom.xml +++ b/dinky-flink/dinky-flink-1.17/pom.xml @@ -62,26 +62,6 @@ org.slf4j slf4j-api - - org.apache.hadoop - hadoop-yarn-common - - - org.apache.hadoop - hadoop-common - - - org.apache.hadoop - hadoop-hdfs - - - org.apache.hadoop - hadoop-yarn-client - - - org.apache.hadoop - hadoop-mapreduce-client-core - diff --git a/dinky-flink/dinky-flink-1.18/pom.xml b/dinky-flink/dinky-flink-1.18/pom.xml index 6065f7a581..2bd58a4b58 100644 --- a/dinky-flink/dinky-flink-1.18/pom.xml +++ b/dinky-flink/dinky-flink-1.18/pom.xml @@ -62,26 +62,6 @@ org.slf4j slf4j-api - - org.apache.hadoop - hadoop-yarn-common - - - org.apache.hadoop - hadoop-common - - - org.apache.hadoop - hadoop-hdfs - - - org.apache.hadoop - hadoop-yarn-client - - - org.apache.hadoop - hadoop-mapreduce-client-core - diff --git a/dinky-gateway/src/main/java/org/dinky/gateway/config/ClusterConfig.java b/dinky-gateway/src/main/java/org/dinky/gateway/config/ClusterConfig.java index 46ec26bbce..553028ae6f 100644 --- a/dinky-gateway/src/main/java/org/dinky/gateway/config/ClusterConfig.java +++ b/dinky-gateway/src/main/java/org/dinky/gateway/config/ClusterConfig.java @@ -53,7 +53,7 @@ public class ClusterConfig { dataType = "String", example = "/etc/hadoop/conf/yarn-site.xml", notes = "Path to the YARN configuration file") - private String yarnConfigPath; + private String hadoopConfigPath; @ApiModelProperty( value = "YARN application ID", @@ -68,10 +68,10 @@ public ClusterConfig(String flinkConfigPath) { this.flinkConfigPath = flinkConfigPath; } - public ClusterConfig(String flinkConfigPath, String flinkLibPath, String yarnConfigPath) { + public ClusterConfig(String flinkConfigPath, String flinkLibPath, String hadoopConfigPath) { this.flinkConfigPath = flinkConfigPath; this.flinkLibPath = flinkLibPath; - this.yarnConfigPath = yarnConfigPath; + this.hadoopConfigPath = hadoopConfigPath; } public static ClusterConfig build(String flinkConfigPath) { @@ -86,6 +86,6 @@ public static ClusterConfig build(String flinkConfigPath, String flinkLibPath, S public String toString() { return String.format( "ClusterConfig{flinkConfigPath='%s', flinkLibPath='%s', yarnConfigPath='%s', " + "appId='%s'}", - flinkConfigPath, flinkLibPath, yarnConfigPath, appId); + flinkConfigPath, flinkLibPath, hadoopConfigPath, appId); } } diff --git a/dinky-gateway/src/main/java/org/dinky/gateway/yarn/YarnApplicationGateway.java b/dinky-gateway/src/main/java/org/dinky/gateway/yarn/YarnApplicationGateway.java index 
cbe83c1269..a5714438c0 100644 --- a/dinky-gateway/src/main/java/org/dinky/gateway/yarn/YarnApplicationGateway.java +++ b/dinky-gateway/src/main/java/org/dinky/gateway/yarn/YarnApplicationGateway.java @@ -64,7 +64,6 @@ public GatewayResult submitJar() { AppConfig appConfig = config.getAppConfig(); configuration.set(PipelineOptions.JARS, Collections.singletonList(appConfig.getUserJarPath())); - configuration.setString( "python.files", FlinkUdfPathContextHolder.getPyUdfFile().stream() diff --git a/dinky-gateway/src/main/java/org/dinky/gateway/yarn/YarnGateway.java b/dinky-gateway/src/main/java/org/dinky/gateway/yarn/YarnGateway.java index 85ef3575cf..5c4c4298ea 100644 --- a/dinky-gateway/src/main/java/org/dinky/gateway/yarn/YarnGateway.java +++ b/dinky-gateway/src/main/java/org/dinky/gateway/yarn/YarnGateway.java @@ -55,14 +55,17 @@ import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.exceptions.YarnException; +import java.io.File; import java.io.IOException; import java.net.URI; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; +import java.util.Set; import java.util.concurrent.Executors; import java.util.stream.Collectors; +import cn.hutool.core.collection.CollUtil; import cn.hutool.core.io.FileUtil; public abstract class YarnGateway extends AbstractGateway { @@ -100,8 +103,8 @@ private void initConfig() { configuration.set(YarnConfigOptions.APPLICATION_NAME, flinkConfig.getJobName()); } - if (Asserts.isNotNullString(clusterConfig.getYarnConfigPath())) { - configuration.setString(HADOOP_CONFIG, clusterConfig.getYarnConfigPath()); + if (Asserts.isNotNullString(clusterConfig.getHadoopConfigPath())) { + configuration.setString(HADOOP_CONFIG, clusterConfig.getHadoopConfigPath()); } if (configuration.containsKey(SecurityOptions.KERBEROS_LOGIN_KEYTAB.key())) { @@ -135,7 +138,7 @@ private void initYarnClient() { } private Path getYanConfigFilePath(String path) { - return new Path(URI.create(config.getClusterConfig().getYarnConfigPath() + "/" + path)); + return new Path(URI.create(config.getClusterConfig().getHadoopConfigPath() + "/" + path)); } public SavePointResult savepointCluster(String savePoint) { @@ -283,6 +286,11 @@ protected YarnClusterDescriptor createYarnClusterDescriptorWithJar() { Arrays.stream(config.getJarPaths()).map(FileUtil::file).collect(Collectors.toList())); yarnClusterDescriptor.addShipFiles(new ArrayList<>(FlinkUdfPathContextHolder.getPyUdfFile())); } + Set otherPluginsFiles = FlinkUdfPathContextHolder.getOtherPluginsFiles(); + + if (CollUtil.isNotEmpty(otherPluginsFiles)) { + yarnClusterDescriptor.addShipFiles(CollUtil.newArrayList(otherPluginsFiles)); + } return yarnClusterDescriptor; } diff --git a/dinky-web/src/pages/Metrics/Job/index.tsx b/dinky-web/src/pages/Metrics/Job/index.tsx index 9d0a5cd015..8bc81ea79f 100644 --- a/dinky-web/src/pages/Metrics/Job/index.tsx +++ b/dinky-web/src/pages/Metrics/Job/index.tsx @@ -19,9 +19,9 @@ import { ChartData, JobMetrics, MetricsLayout, SubTask, Task } from '@/pages/Metrics/Job/data'; import { - buildMetricsList, - buildRunningJobList, - buildSubTaskList + buildMetricsList, + buildRunningJobList, + buildSubTaskList } from '@/pages/Metrics/Job/function'; import { getFlinkRunTask, saveFlinkMetrics } from '@/pages/Metrics/Job/service'; import { getData } from '@/services/api'; @@ -33,274 +33,274 @@ import { useEffect, useState } from 'react'; import FlinkChart from '../../../components/FlinkChart'; const getJobMetrics = async (job: JobMetrics) => { - const url = - 
API_CONSTANTS.FLINK_PROXY + - '/' + - job.url + - '/jobs/' + - job.flinkJobId + - '/vertices/' + - job.subTaskId + - '/metrics' + - '?get=' + - encodeURIComponent(job.metrics); - const json = await getData(url); - json[0].time = new Date(); - return json[0] as ChartData; + const url = + API_CONSTANTS.FLINK_PROXY + + '/' + + job.url + + '/jobs/' + + job.flinkJobId + + '/vertices/' + + job.subTaskId + + '/metrics' + + '?get=' + + encodeURIComponent(job.metrics); + const json = await getData(url); + json[0].time = new Date(); + return json[0] as ChartData; }; const Job = () => { - const [metricsData, setMetricsData] = useState({ - url: '', - jid: '', - flinkName: '', - selectTaskId: 0, - selectSubTask: '', - selectMetrics: [] as string[] - }); + const [metricsData, setMetricsData] = useState({ + url: '', + jid: '', + flinkName: '', + selectTaskId: 0, + selectSubTask: '', + selectMetrics: [] as string[] + }); - const [subTaskList, setSubTaskList] = useState([]); - const [metrics, setMetrics] = useState([]); - const [taskData, setTaskData] = useState([]); - const [jobMetricsList, setJobMetricsList] = useState([]); - const [chartData, setChartData] = useState>({}); - const [layoutName, setLayoutName] = useState(''); - const [timers, setTimers] = useState>({}); + const [subTaskList, setSubTaskList] = useState([]); + const [metrics, setMetrics] = useState([]); + const [taskData, setTaskData] = useState([]); + const [jobMetricsList, setJobMetricsList] = useState([]); + const [chartData, setChartData] = useState>({}); + const [layoutName, setLayoutName] = useState(''); + const [timers, setTimers] = useState>({}); - useEffect(() => { - getFlinkRunTask().then((res) => { - setTaskData(res.data); - }); - }, []); + useEffect(() => { + getFlinkRunTask().then((res) => { + setTaskData(res.data); + }); + }, []); - useEffect(() => { - Object.keys(timers) - .filter((x) => !jobMetricsList.map((x) => x.metrics).includes(x)) - // @ts-ignore - .forEach((x) => clearInterval(timers[x])); - }, [jobMetricsList]); + useEffect(() => { + Object.keys(timers) + .filter((x) => !jobMetricsList.map((x) => x.metrics).includes(x)) + // @ts-ignore + .forEach((x) => clearInterval(timers[x])); + }, [jobMetricsList]); - /** - * query flink job detail - * @param {number} id - * @returns {Promise} - */ - const getFlinkTaskDetail = async (id: number) => { - return await getData(API_CONSTANTS.REFRESH_JOB_DETAIL, { id: id }); - }; + /** + * query flink job detail + * @param {number} id + * @returns {Promise} + */ + const getFlinkTaskDetail = async (id: number) => { + return await getData(API_CONSTANTS.REFRESH_JOB_DETAIL, { id: id }); + }; - /** - * query flink job sub task - * @param {string} url - * @param {string} jid - * @returns {Promise<[]>} - */ - const getFlinkJobSubTask = async (url: string, jid: string) => { - const flinkJobVertices = await getData(API_CONSTANTS.FLINK_PROXY + '/' + url + '/jobs/' + jid); - return flinkJobVertices.vertices as SubTask[]; - }; + /** + * query flink job sub task + * @param {string} url + * @param {string} jid + * @returns {Promise<[]>} + */ + const getFlinkJobSubTask = async (url: string, jid: string) => { + const flinkJobVertices = await getData(API_CONSTANTS.FLINK_PROXY + '/' + url + '/jobs/' + jid); + return flinkJobVertices.vertices as SubTask[]; + }; - /** - * query flink job metrics list - * @param {string} url - * @param {string} jid - * @param subTask - * @returns {Promise} - */ - const getFlinkJobMetrics = async (url: string, jid: string, subTask: string) => { - const flinkJobMetrics = 
await getData( - API_CONSTANTS.FLINK_PROXY + '/' + url + '/jobs/' + jid + '/vertices/' + subTask + '/metrics' - ); - return (flinkJobMetrics as any[]).map((x) => x.id as string); - }; + /** + * query flink job metrics list + * @param {string} url + * @param {string} jid + * @param subTask + * @returns {Promise} + */ + const getFlinkJobMetrics = async (url: string, jid: string, subTask: string) => { + const flinkJobMetrics = await getData( + API_CONSTANTS.FLINK_PROXY + '/' + url + '/jobs/' + jid + '/vertices/' + subTask + '/metrics' + ); + return (flinkJobMetrics as any[]).map((x) => x.id as string); + }; - /** - * 1 level , change running job - * @returns {Promise} - * @param taskId - */ - const handleRunningJobChange = async (taskId: number) => { - // query data of flink running job - const taskDetail = await getFlinkTaskDetail(taskId); - // 解构出 flink job url , job name , job id - const { - cluster: { hosts: url }, - instance: { name: flinkJobName, jid: flinkJobId } - } = taskDetail.data; - setMetricsData((prevState) => ({ - ...prevState, - url: url, - flinkName: flinkJobName, - jid: flinkJobId, - selectTaskId: taskId - })); - const subTasks = await getFlinkJobSubTask(url, flinkJobId); - setSubTaskList(subTasks); - }; + /** + * 1 level , change running job + * @returns {Promise} + * @param taskId + */ + const handleRunningJobChange = async (taskId: number) => { + // query data of flink running job + const taskDetail = await getFlinkTaskDetail(taskId); + // 解构出 flink job url , job name , job id + const { + cluster: { hosts: url }, + instance: { name: flinkJobName, jid: flinkJobId } + } = taskDetail.data; + setMetricsData((prevState) => ({ + ...prevState, + url: url, + flinkName: flinkJobName, + jid: flinkJobId, + selectTaskId: taskId + })); + const subTasks = await getFlinkJobSubTask(url, flinkJobId); + setSubTaskList(subTasks); + }; - /** - * 2 level , change subtask - * @returns {Promise} - * @param subTaskName - */ - const handleSubTaskChange = async (subTaskName: string) => { - setMetricsData((prevState) => ({ - ...prevState, - selectSubTask: subTaskName - })); - const jobMetricsDataList = await getFlinkJobMetrics( - metricsData.url, - metricsData.jid, - subTaskName - ); - setMetrics(jobMetricsDataList.sort()); - }; + /** + * 2 level , change subtask + * @returns {Promise} + * @param subTaskName + */ + const handleSubTaskChange = async (subTaskName: string) => { + setMetricsData((prevState) => ({ + ...prevState, + selectSubTask: subTaskName + })); + const jobMetricsDataList = await getFlinkJobMetrics( + metricsData.url, + metricsData.jid, + subTaskName + ); + setMetrics(jobMetricsDataList.sort()); + }; - /** - * 3 level , change metrics list - * @returns {Promise} - * @param selectList - */ - const handleMetricsChange = async (selectList: string[]) => { - setMetricsData((prevState) => ({ - ...prevState, - selectMetrics: selectList - })); + /** + * 3 level , change metrics list + * @returns {Promise} + * @param selectList + */ + const handleMetricsChange = async (selectList: string[]) => { + setMetricsData((prevState) => ({ + ...prevState, + selectMetrics: selectList + })); - const d: JobMetrics[] = selectList.map((item) => { - return { - taskId: metricsData.selectTaskId, - url: metricsData.url, - flinkJobId: metricsData.jid, - jobName: metricsData.flinkName, - subTaskId: metricsData.selectSubTask, - metrics: item, - layoutName: layoutName, - title: item, - showSize: '25%', - showType: 'Chart' - }; - }); - d.forEach((j) => { - const data: ChartData[] = []; - chartData[j.taskId + 
j.subTaskId + j.metrics] = data; - setChartData(chartData); - timers[j.metrics] = setInterval(() => { - getJobMetrics(j).then((res) => { - data.push(res); + const d: JobMetrics[] = selectList.map((item) => { + return { + taskId: metricsData.selectTaskId, + url: metricsData.url, + flinkJobId: metricsData.jid, + jobName: metricsData.flinkName, + subTaskId: metricsData.selectSubTask, + metrics: item, + layoutName: layoutName, + title: item, + showSize: '25%', + showType: 'Chart' + }; }); - }, 1000); - setTimers(timers); - }); - setJobMetricsList(d); - }; - /** - * render metrics card list - * @param {JobMetrics[]} metricsList - * @returns {JSX.Element} - */ - const renderMetricsCardList = (metricsList: JobMetrics[]) => { + d.forEach((j) => { + const data: ChartData[] = []; + chartData[j.taskId + j.subTaskId + j.metrics] = data; + setChartData(chartData); + timers[j.metrics] = setInterval(() => { + getJobMetrics(j).then((res) => { + data.push(res); + }); + }, 1000); + setTimers(timers); + }); + setJobMetricsList(d); + }; + /** + * render metrics card list + * @param {JobMetrics[]} metricsList + * @returns {JSX.Element} + */ + const renderMetricsCardList = (metricsList: JobMetrics[]) => { + return ( + <> + + {metricsList.map((j) => { + return ( + { + j.showSize = chartSize; + j.showType = chartType; + }} + data={chartData[j.taskId + j.subTaskId + j.metrics]} + title={j.metrics} + extraType={'size'} + /> + ); + })} + + + ); + }; + return ( - <> - - {metricsList.map((j) => { - return ( - { - j.showSize = chartSize; - j.showType = chartType; - }} - data={chartData[j.taskId + j.subTaskId + j.metrics]} - title={j.metrics} - extraType={'size'} - /> - ); - })} - - + <> + setLayoutName(e.target.value)} + style={{ width: '100vh' }} + /> + } + extra={ + + } + > + handleRunningJobChange(value as number) }} + /> + {metricsData.selectTaskId !== 0 && ( + handleSubTaskChange(value as string) }} + /> + )} + {metricsData.selectSubTask !== '' && ( + handleMetricsChange(value as string[]) }} + /> + )} + {/* render metrics list */} + {jobMetricsList.length > 0 && renderMetricsCardList(jobMetricsList)} + + ); - }; - - return ( - <> - setLayoutName(e.target.value)} - style={{ width: '100vh' }} - /> - } - extra={ - - } - > - handleRunningJobChange(value as number) }} - /> - {metricsData.selectTaskId !== 0 && ( - handleSubTaskChange(value as string) }} - /> - )} - {metricsData.selectSubTask !== '' && ( - handleMetricsChange(value as string[]) }} - /> - )} - {/* render metrics list */} - {jobMetricsList.length > 0 && renderMetricsCardList(jobMetricsList)} - - - ); }; export default Job; diff --git a/dinky-web/src/pages/RegCenter/Cluster/Configuration/components/ConfigurationModal/ConfigurationForm/YarnConfig/index.tsx b/dinky-web/src/pages/RegCenter/Cluster/Configuration/components/ConfigurationModal/ConfigurationForm/YarnConfig/index.tsx index 571a0810ed..d531b925b2 100644 --- a/dinky-web/src/pages/RegCenter/Cluster/Configuration/components/ConfigurationModal/ConfigurationForm/YarnConfig/index.tsx +++ b/dinky-web/src/pages/RegCenter/Cluster/Configuration/components/ConfigurationModal/ConfigurationForm/YarnConfig/index.tsx @@ -1,19 +1,19 @@ /* * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. * */ @@ -39,7 +39,7 @@ const YarnConfig = (props: { flinkConfigOptions: DefaultOptionType[] }) => { {l('rc.cc.hadoopConfig')} { {l('rc.cc.flinkConfig')} { />
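On the configuration side, ClusterConfig's yarnConfigPath becomes hadoopConfigPath: the value is now treated as a Hadoop configuration directory, YarnGateway resolves individual config files against it (getYanConfigFilePath), and the cluster configuration form above labels the field with rc.cc.hadoopConfig, presumably binding to hadoopConfigPath. A minimal usage sketch, assuming the three-argument build factory shown in the ClusterConfig hunk; all paths are hypothetical examples, not defaults shipped by this patch:

    import org.dinky.gateway.config.ClusterConfig;

    // Sketch only: the third argument is now a Hadoop conf directory, not a single yarn config file.
    public class ClusterConfigSketch {
        public static void main(String[] args) {
            ClusterConfig config = ClusterConfig.build(
                    "/opt/flink/conf",   // flinkConfigPath (hypothetical)
                    "/opt/flink/lib",    // flinkLibPath (hypothetical)
                    "/etc/hadoop/conf"); // hadoopConfigPath (hypothetical)
            // Note: per this patch, toString() still formats this field under the old yarnConfigPath label.
            System.out.println(config);
        }
    }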