diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DiagnosticJStackService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DiagnosticJStackService.java new file mode 100644 index 0000000000000..bb6fae0eda2a7 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DiagnosticJStackService.java @@ -0,0 +1,134 @@ +/** * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.yarn.server.nodemanager; + +import org.apache.hadoop.util.Shell; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.*; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.StandardCopyOption; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +public class DiagnosticJStackService { + + private static final Logger LOG = LoggerFactory + .getLogger(DiagnosticJStackService.class); + private static final String PYTHON_COMMAND = "python3"; + private static String scriptLocation = null; + + static { + try { + // Extract script from JAR to a temp file + InputStream in = DiagnosticJStackService.class.getClassLoader() + .getResourceAsStream("diagnostics/jstack_collector.py"); + File tempScript = File.createTempFile("jstack_collector", ".py"); + Files.copy(in, tempScript.toPath(), StandardCopyOption.REPLACE_EXISTING); + tempScript.setExecutable(true); // Set execute permission + scriptLocation = tempScript.getAbsolutePath(); + } catch (IOException e) { + LOG.error("Failed to extract Python script from JAR", e); + } + } + + public static String collectNodeThreadDump(String numberOfJStack) + throws Exception { + if (Shell.WINDOWS) { + throw new UnsupportedOperationException("Not implemented for Windows"); + } + + ProcessBuilder pb = createProcessBuilder(numberOfJStack); + + return executeCommand(pb); + + } + + + + public static String collectApplicationThreadDump(String appId, String numberOfJStack) + throws Exception { + if (Shell.WINDOWS) { + throw new UnsupportedOperationException("Not implemented for Windows."); + } + ProcessBuilder pb = createProcessBuilder(appId, numberOfJStack); + + LOG.info("Diagnostic process environment: {}", pb.environment()); + + return executeCommand(pb); + } + + protected static ProcessBuilder createProcessBuilder(String numberOfJStack) { + List commandList = + new ArrayList<>(Arrays.asList(PYTHON_COMMAND, 
scriptLocation, numberOfJStack)); + + return new ProcessBuilder(commandList); + } + + + protected static ProcessBuilder createProcessBuilder(String appId, String numberOfJStack) { + List commandList = + new ArrayList<>(Arrays.asList(PYTHON_COMMAND, scriptLocation, appId, numberOfJStack)); + + return new ProcessBuilder(commandList); + } + + private static String executeCommand(ProcessBuilder pb) + throws Exception { + Process process = pb.start(); + int exitCode; + StringBuilder outputBuilder = new StringBuilder(); + StringBuilder errorBuilder = new StringBuilder(); + + try ( + BufferedReader stdoutReader = new BufferedReader(new InputStreamReader(process.getInputStream(), + StandardCharsets.UTF_8)); + BufferedReader stderrReader = new BufferedReader(new InputStreamReader(process.getErrorStream(), + StandardCharsets.UTF_8)); + ) { + + String line; + while ((line = stdoutReader.readLine()) != null) { + outputBuilder.append(line).append("\n"); + } + + while ((line = stderrReader.readLine()) != null) { + errorBuilder.append(line).append("\n"); + } + if (!errorBuilder.toString().isEmpty()) { + LOG.error("Python script stderr: {}", errorBuilder); + } + + process.waitFor(); + } catch (Exception e) { + LOG.error("Error getting JStack: {}", pb.command()); + throw e; + } + exitCode = process.exitValue(); + if (exitCode != 0) { + throw new IOException("The JStack collector script exited with non-zero " + + "exit code: " + exitCode); + } + + return outputBuilder.toString(); + } + +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/NMWebServices.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/NMWebServices.java index 6cfd43cd13acd..ddee07c3c154d 100644 --- 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/NMWebServices.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/NMWebServices.java @@ -31,12 +31,14 @@ import java.util.Set; import org.apache.hadoop.io.IOUtils; +import org.apache.hadoop.yarn.server.nodemanager.DiagnosticJStackService; import org.apache.hadoop.yarn.server.nodemanager.containermanager.records.AuxServiceRecord; import org.apache.hadoop.yarn.server.nodemanager.containermanager.records.AuxServiceRecords; import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePlugin; import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePluginManager; import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.AuxiliaryServicesInfo; import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.NMResourceInfo; +import org.apache.hadoop.yarn.webapp.WebAppException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -271,6 +273,37 @@ public ContainerInfo getNodeContainer(@javax.ws.rs.core.Context } + @GET + @Path("/jstack/{numberOfJStack}") + public Response getNodeThreadDump(@PathParam("numberOfJStack") String numberOfJStack) + { // Make sure the NodeManager have python3 install + try { + return Response.status(Status.OK) + .entity(DiagnosticJStackService.collectNodeThreadDump(numberOfJStack)) + .build(); + } catch (Exception e) { + throw new WebAppException("Error collection NodeManager JStack: " + e.getMessage() + ". 
" + + "For more information please check the NodeManager logs."); + } + } + + + @GET + @Path("/apps/{appid}/jstack/{numberOfJStack}") + @Produces({MediaType.TEXT_PLAIN}) + public Response getApplicationJStack(@PathParam("appid") String appId, + @PathParam("numberOfJStack") String numberOfJStack) + { // Make sure the NodeManager have python3 install + try { + return Response.status(Status.OK) + .entity(DiagnosticJStackService.collectApplicationThreadDump(appId, numberOfJStack)) + .build(); + } catch (Exception e) { + throw new WebAppException("Error collecting Application JStack: " + e.getMessage() + ". " + + "For more information please check the NodeManager logs."); + } + } + /** * Returns log file's name as well as current file size for a container. * diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/resources/diagnostics/jstack_collector.py b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/resources/diagnostics/jstack_collector.py new file mode 100644 index 0000000000000..ce215ecc377d1 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/resources/diagnostics/jstack_collector.py @@ -0,0 +1,97 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and
# limitations under the License.

import subprocess
import sys


def get_nodemanager_pid():
    """Return the PIDs of NodeManager JVMs running on this host.

    A host may run more than one NodeManager, so a list is returned.
    """
    # Example `ps aux` row: the java command line carries
    # -Dproc_nodemanager ... org.apache.hadoop.yarn.server.nodemanager.NodeManager
    results = run_command("ps aux | grep nodemanager | grep -v grep")
    pids = []
    for line in results.strip().splitlines():
        pids.append(line.split()[1])  # PID is the second ps column
    return pids


def get_app_pid(app_id):
    """Return the PIDs of application JVMs running on this host.

    NOTE(review): ``app_id`` is not used for filtering yet, because
    shell-launched JVMs (e.g. DistributedShell AMs) do not carry the app id
    on their command line.  TODO: add ``grep app_id`` once long-running
    applications such as MapReduce are the target.
    """
    results = run_command(
        "ps aux | grep jvm/java | grep -v -e /bin/bash -e grep")
    pids = []
    for line in results.strip().splitlines():
        pids.append(line.split()[1])
    return pids


def execute_jstack(pids, number_of_jstack):
    """Run jstack ``number_of_jstack`` times for every PID and join the
    outputs, each prefixed with an iteration/PID header."""
    all_jstacks = []
    for pid in pids:
        for i in range(number_of_jstack):
            jstack_output = run_command("jstack", pid)
            all_jstacks.append(
                "--- JStack iteration-{} for PID: {} ---\n{}".format(
                    i, pid, jstack_output))
    return "\n".join(all_jstacks)


def run_command(*argv):
    """Run a command and return its stdout decoded as UTF-8.

    A single argument is treated as a shell pipeline (needed for the
    ``ps | grep`` chains).  Multiple arguments are executed WITHOUT a shell,
    so values such as PIDs can never be interpreted by one (the original
    joined everything into one shell=True string, an injection hazard for
    values that ultimately come from an HTTP path).
    On failure an error description is returned (and echoed to stderr).
    """
    try:
        if len(argv) == 1:
            print("Running shell command:", argv[0])
            response = subprocess.run(
                argv[0], stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                shell=True, check=True)
        else:
            print("Running command:", " ".join(argv))
            response = subprocess.run(
                list(argv), stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                check=True)
        response_str = response.stdout.decode('utf-8')
    except subprocess.CalledProcessError as e:
        stderr_text = e.stderr.decode('utf-8', 'replace') if e.stderr else ''
        response_str = "Unable to run command: {}: {}".format(e, stderr_text)
        print(response_str, file=sys.stderr)
    except Exception as e:
        response_str = "Exception occurred: {}".format(e)
        print(response_str, file=sys.stderr)
    return response_str


def main():
    # Usage (mirrors DiagnosticJStackService):
    #   jstack_collector.py <numberOfJStack>          dump the NodeManager
    #   jstack_collector.py <appId> <numberOfJStack>  dump application JVMs
    #
    # The original dispatched on `"app" in sys.argv[0] > 1`, a chained
    # comparison that inspects the *script path* and therefore never selected
    # application mode (and then crashed on int(app_id)).  The argument count
    # is what actually distinguishes the two call shapes.
    if len(sys.argv) > 2:
        app_id = sys.argv[1]
        number_of_jstack = int(sys.argv[2])
        pids = get_app_pid(app_id)
    else:
        number_of_jstack = int(sys.argv[1])
        pids = get_nodemanager_pid()

    if not pids:
        print("No active process id in this NodeManager.")
        sys.exit(0)

    # The Java ProcessBuilder reads this stdout as the HTTP response body.
    print(execute_jstack(pids, number_of_jstack))


if __name__ == "__main__":
    main()