-
Notifications
You must be signed in to change notification settings - Fork 9.1k
YARN-11823: add new endpoints for getting jstacks of application and nodes #7726
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: trunk
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,134 @@ | ||
/** * Licensed to the Apache Software Foundation (ASF) under one | ||
* or more contributor license agreements. See the NOTICE file | ||
* distributed with this work for additional information | ||
* regarding copyright ownership. The ASF licenses this file | ||
* to you under the Apache License, Version 2.0 (the | ||
* "License"); you may not use this file except in compliance | ||
* with the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package org.apache.hadoop.yarn.server.nodemanager; | ||
|
||
import org.apache.hadoop.util.Shell; | ||
import org.slf4j.Logger; | ||
import org.slf4j.LoggerFactory; | ||
|
||
import java.io.*; | ||
import java.nio.charset.StandardCharsets; | ||
import java.nio.file.Files; | ||
import java.nio.file.StandardCopyOption; | ||
import java.util.ArrayList; | ||
import java.util.Arrays; | ||
import java.util.List; | ||
|
||
public class DiagnosticJStackService { | ||
|
||
private static final Logger LOG = LoggerFactory | ||
.getLogger(DiagnosticJStackService.class); | ||
private static final String PYTHON_COMMAND = "python3"; | ||
private static String scriptLocation = null; | ||
|
||
static { | ||
try { | ||
// Extract script from JAR to a temp file | ||
InputStream in = DiagnosticJStackService.class.getClassLoader() | ||
.getResourceAsStream("diagnostics/jstack_collector.py"); | ||
File tempScript = File.createTempFile("jstack_collector", ".py"); | ||
Files.copy(in, tempScript.toPath(), StandardCopyOption.REPLACE_EXISTING); | ||
tempScript.setExecutable(true); // Set execute permission | ||
scriptLocation = tempScript.getAbsolutePath(); | ||
} catch (IOException e) { | ||
LOG.error("Failed to extract Python script from JAR", e); | ||
} | ||
} | ||
|
||
public static String collectNodeThreadDump(String numberOfJStack) | ||
throws Exception { | ||
if (Shell.WINDOWS) { | ||
throw new UnsupportedOperationException("Not implemented for Windows"); | ||
} | ||
|
||
ProcessBuilder pb = createProcessBuilder(numberOfJStack); | ||
|
||
return executeCommand(pb); | ||
|
||
} | ||
|
||
|
||
|
||
public static String collectApplicationThreadDump(String appId, String numberOfJStack) | ||
throws Exception { | ||
if (Shell.WINDOWS) { | ||
throw new UnsupportedOperationException("Not implemented for Windows."); | ||
} | ||
ProcessBuilder pb = createProcessBuilder(appId, numberOfJStack); | ||
|
||
LOG.info("Diagnostic process environment: {}", pb.environment()); | ||
|
||
return executeCommand(pb); | ||
} | ||
|
||
protected static ProcessBuilder createProcessBuilder(String numberOfJStack) { | ||
List<String> commandList = | ||
new ArrayList<>(Arrays.asList(PYTHON_COMMAND, scriptLocation, numberOfJStack)); | ||
|
||
return new ProcessBuilder(commandList); | ||
} | ||
|
||
|
||
protected static ProcessBuilder createProcessBuilder(String appId, String numberOfJStack) { | ||
List<String> commandList = | ||
new ArrayList<>(Arrays.asList(PYTHON_COMMAND, scriptLocation, appId, numberOfJStack)); | ||
|
||
return new ProcessBuilder(commandList); | ||
} | ||
|
||
private static String executeCommand(ProcessBuilder pb) | ||
throws Exception { | ||
Process process = pb.start(); | ||
int exitCode; | ||
StringBuilder outputBuilder = new StringBuilder(); | ||
StringBuilder errorBuilder = new StringBuilder(); | ||
|
||
try ( | ||
BufferedReader stdoutReader = new BufferedReader(new InputStreamReader(process.getInputStream(), | ||
StandardCharsets.UTF_8)); | ||
BufferedReader stderrReader = new BufferedReader(new InputStreamReader(process.getErrorStream(), | ||
StandardCharsets.UTF_8)); | ||
) { | ||
|
||
String line; | ||
while ((line = stdoutReader.readLine()) != null) { | ||
outputBuilder.append(line).append("\n"); | ||
} | ||
|
||
while ((line = stderrReader.readLine()) != null) { | ||
errorBuilder.append(line).append("\n"); | ||
} | ||
if (!errorBuilder.toString().isEmpty()) { | ||
LOG.error("Python script stderr: {}", errorBuilder); | ||
} | ||
|
||
process.waitFor(); | ||
} catch (Exception e) { | ||
LOG.error("Error getting JStack: {}", pb.command()); | ||
throw e; | ||
} | ||
exitCode = process.exitValue(); | ||
if (exitCode != 0) { | ||
throw new IOException("The JStack collector script exited with non-zero " + | ||
"exit code: " + exitCode); | ||
} | ||
|
||
return outputBuilder.toString(); | ||
} | ||
|
||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
# Licensed to the Apache Software Foundation (ASF) under one | ||
Check failure on line 1 in hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/resources/diagnostics/jstack_collector.py
|
||
# or more contributor license agreements. See the NOTICE file | ||
# distributed with this work for additional information | ||
# regarding copyright ownership. The ASF licenses this file | ||
# to you under the Apache License, Version 2.0 (the | ||
# "License"); you may not use this file except in compliance | ||
# with the License. You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
import subprocess | ||
import sys | ||
|
||
|
||
def get_nodemanager_pid():
    """Return the PIDs of the NodeManager processes found on this host.

    The PIDs are taken from the second column of ``ps aux`` for every line
    mentioning ``nodemanager``. A host may run more than one NodeManager, so
    a list is returned; it is empty when nothing matches.

    Returns:
        list[str]: PIDs as strings, in ``ps`` output order.
    """
    # NOTE(review): a reviewer of this change raised a security concern:
    # these dumps should not be reachable through the REST API on an
    # unsecured cluster, and secured clusters should authorize the caller.
    # The author replied that the script only runs jstack on the discovered
    # Java processes. No resolution is visible in this discussion.

    # Example of a matching line (arguments shortened):
    #   root  414  1.3  1.7 ... /usr/lib/jvm/java-8-openjdk//bin/java
    #       -Dproc_nodemanager ... org.apache.hadoop.yarn.server.nodemanager.NodeManager
    results = run_command("ps aux | grep nodemanager | grep -v grep")

    pids = []
    for line in results.strip().splitlines():
        fields = line.split()
        # run_command returns its error message instead of raising (for
        # example when grep matches nothing), so only accept a numeric
        # second column; otherwise a word of the error text would be
        # treated as a PID.
        if len(fields) > 1 and fields[1].isdigit():
            pids.append(fields[1])

    return pids
|
||
|
||
def get_app_pid(app_id):
    """Return the PIDs of the Java processes found on this host.

    Args:
        app_id: application id requested by the caller. It is not used for
            filtering yet (see the TODO below), so every process whose
            command line contains ``jvm/java`` is returned.

    Returns:
        list[str]: PIDs as strings, in ``ps`` output order; empty when
        nothing matches.
    """
    # Example of matching lines (arguments shortened):
    #   root    413 ... /usr/lib/jvm/java-8-openjdk//bin/java -Dproc_nodemanager ...
    #   root  41611 ... /usr/lib/jvm/java-8-openjdk//bin/java -Xmx750m
    #       org.apache.hadoop.yarn.applications.distributedshell.ApplicationMaster ...
    # TODO: later include "grep app_id" for long java application like mapreduce
    results = run_command("ps aux | grep jvm/java | grep -v -e /bin/bash -e grep")

    pids = []
    for line in results.strip().splitlines():
        fields = line.split()
        # run_command returns its error message instead of raising, so only
        # accept a numeric second column; otherwise a word of the error text
        # would be treated as a PID.
        if len(fields) > 1 and fields[1].isdigit():
            pids.append(fields[1])

    return pids
|
||
|
||
def execute_jstack(pids, number_of_jstack):
    """Run ``jstack`` repeatedly against each PID and return one report.

    Args:
        pids: iterable of PID strings.
        number_of_jstack: how many dumps to take for each PID.

    Returns:
        str: the dumps joined by newlines, each preceded by a header naming
        the iteration index and the PID; empty when there is nothing to run.
    """
    section = "--- JStack iteration-{} for PID: {} ---\n{}"
    dumps = [
        section.format(iteration, pid, run_command("jstack", pid))
        for pid in pids
        for iteration in range(number_of_jstack)  # several dumps per PID
    ]
    return "\n".join(dumps)
|
||
|
||
def run_command(*argv):
    """Run a shell command and return its standard output as text.

    The arguments are joined with single spaces and executed through the
    shell. Failures never raise: the error description is written to
    standard error and returned in place of the command output.

    Args:
        *argv: string fragments of the command line.

    Returns:
        str: decoded standard output on success, otherwise an error message.
    """
    try:
        command_line = " ".join(argv)
        print("Running command with arguments:", command_line)
        completed = subprocess.run(
            command_line,
            shell=True,
            check=True,  # a non-zero exit status becomes CalledProcessError
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        return completed.stdout.decode('utf-8')
    except subprocess.CalledProcessError as error:
        message = "Unable to run command: {}".format(error)
    except Exception as error:
        message = "Exception occurred: {}".format(error)

    print(message, file=sys.stderr)
    return message
|
||
|
||
def main():
    """Entry point: pick the target processes and print their thread dumps.

    Command line:
        jstack_collector.py <number_of_jstack>            NodeManager dumps
        jstack_collector.py <app_id> <number_of_jstack>   application dumps

    The report is printed to standard output, which the Java process that
    launched the script reads.
    """
    args = sys.argv[1:]

    # The mode is selected by the number of arguments. The previous test,
    # `"app" in sys.argv[0] > 1`, was a chained comparison on the script
    # path and never selected application mode.
    if len(args) >= 2:
        app_id = args[0]  # e.g. "application_1748517687882_0013"
        number_of_jstack = int(args[1])
        pids = get_app_pid(app_id)
    elif len(args) == 1:
        number_of_jstack = int(args[0])
        pids = get_nodemanager_pid()
    else:
        print("Usage: jstack_collector.py [app_id] number_of_jstack",
              file=sys.stderr)
        sys.exit(2)

    if not pids:
        print("No active process id in this NodeManager.")
        sys.exit(0)

    jstacks = execute_jstack(pids, number_of_jstack)
    print(jstacks)  # The Initiated java processBuilder will read this stdout
||
|
||
# Run the collector only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
|
||
Check failure on line 97 in hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/resources/diagnostics/jstack_collector.py
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This static block will block NodeManager startup until it has finished.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
According to my testing, it is very fast when I access the JStack endpoint. Do you happen to have a better idea of getting the script file from /resources folder?