From a60921ba8fa3ca5cec537ec4145dacd88d1f2b48 Mon Sep 17 00:00:00 2001 From: Andreas Motl Date: Sun, 7 Jul 2024 17:58:34 +0200 Subject: [PATCH] Troubleshooting: Rework section, add dedicated pages for jcmd, JFR, CFR --- docs/admin/troubleshooting/cfr.md | 96 ++++++++++ docs/admin/troubleshooting/crate-node.rst | 104 ++++++----- docs/admin/troubleshooting/index.md | 106 ++++++----- .../{docker-jcmd.rst => jcmd/docker.rst} | 165 ++++++++---------- docs/admin/troubleshooting/jcmd/index.md | 134 ++++++++++++++ docs/admin/troubleshooting/jcmd/jfr.md | 89 ++++++++++ docs/admin/troubleshooting/system-tables.rst | 96 +++++----- docs/conf.py | 5 + 8 files changed, 563 insertions(+), 232 deletions(-) create mode 100644 docs/admin/troubleshooting/cfr.md rename docs/admin/troubleshooting/{docker-jcmd.rst => jcmd/docker.rst} (61%) create mode 100644 docs/admin/troubleshooting/jcmd/index.md create mode 100644 docs/admin/troubleshooting/jcmd/jfr.md diff --git a/docs/admin/troubleshooting/cfr.md b/docs/admin/troubleshooting/cfr.md new file mode 100644 index 00000000..0693b82c --- /dev/null +++ b/docs/admin/troubleshooting/cfr.md @@ -0,0 +1,96 @@ +(cfr)= +# CrateDB Flight Recorder (CFR) + +:::{rubric} About +::: +In a similar spirit to the [](#jfr), CFR helps to collect information about +CrateDB clusters for support requests and self-service debugging. + +CFR is a utility application to acquire and export diagnostic information from +CrateDB's [system tables](#systables) into an archive file. You can transmit +this file to support engineers, in order to optimally convey relevant +information about your cluster, mostly for debugging and troubleshooting +purposes. + +:::{rubric} Details +::: +The CrateDB Flight Recorder (CFR) is an ETL application dumping all database +tables in the `sys` schema into a timestamped tarball archive file. +On the receiving end, the recording can be imported into another CrateDB +instance, in order to inspect and analyze it. 
+ +Flight recordings can be started against any running CrateDB cluster at runtime. +The utility connects to CrateDB like a regular client, talking SQL. +CFR is part of the CrateDB Toolkit (`ctk cfr`), and is also available as a +standalone application `cratedb-cfr(.exe)`. + + +## Synopsis + +:Export: + + `cratedb-cfr sys-export` invokes the export operation. + +:Import: + + `cratedb-cfr sys-import` invokes the import operation. + + +## Install + +Select one of the standalone application bundles, matching the platform +and architecture of the corresponding system where you intend to run CFR. + +::::{grid} 1 2 2 2 + +:::{grid-item-card} {material-outlined}`download_for_offline;1.4em` Linux x64 +:link: https://github.com/crate-workbench/cratedb-toolkit/actions/runs/9826830191/artifacts/1674929097 +:link-alt: CFR for Linux x64 +:padding: 0 +:class-title: sd-fs-5 ++++ +cratedb-cfr-linux-x64.zip +::: + +:::{grid-item-card} {material-outlined}`download_for_offline;1.4em` macOS x64 +:link: https://github.com/crate-workbench/cratedb-toolkit/actions/runs/9826830191/artifacts/1674929134 +:link-alt: CFR for macOS x64 +:padding: 0 +:class-title: sd-fs-5 ++++ +cratedb-cfr-macos-x64.zip +::: + +:::{grid-item-card} {material-outlined}`download_for_offline;1.4em` Windows x64 +:link: https://github.com/crate-workbench/cratedb-toolkit/actions/runs/9826830191/artifacts/1674930132 +:link-alt: CFR for Windows x64 +:padding: 0 +:class-title: sd-fs-5 ++++ +cratedb-cfr-windows-x64.zip +::: + +:::{grid-item-card} {material-outlined}`download_for_offline;1.4em` macOS ARM64 +:link: https://github.com/crate-workbench/cratedb-toolkit/actions/runs/9826830191/artifacts/1674927962 +:link-alt: CFR for macOS ARM64 +:padding: 0 +:class-title: sd-fs-5 ++++ +cratedb-cfr-macos-arm64.zip +::: + +:::: + + + +## Learn + +:::{card} {material-outlined}`library_books;1.6em` CrateDB Cluster Flight Recorder (CFR) +:link: ctk:cfr +:link-type: ref +Learn about the concepts of CFR, and how to use it. 
+::: + + +[Java Flight Recorder]: https://en.wikipedia.org/wiki/JDK_Flight_Recorder +[jcmd]: https://docs.oracle.com/en/java/javase/17/docs/specs/man/jcmd.html diff --git a/docs/admin/troubleshooting/crate-node.rst b/docs/admin/troubleshooting/crate-node.rst index 3b099753..8343f514 100644 --- a/docs/admin/troubleshooting/crate-node.rst +++ b/docs/admin/troubleshooting/crate-node.rst @@ -2,17 +2,17 @@ .. _use-crate-node: -=============================================== -Troubleshooting with the ``crate-node`` command -=============================================== +========================== +The ``crate-node`` command +========================== -This document shows you how to troubleshoot CrateDB nodes with the -`crate-node`_ command. Using this command, you can: +Use the `crate-node`_ command to troubleshoot CrateDB cluster nodes. +Using this command, you can: -* Repurpose nodes and clean up their old data +* Repurpose nodes and clean up their old data. * Force the election of a master node (and the creation of a new cluster) in - the event that you lose too many nodes to be able to form a quorum -* Detach nodes from an old cluster so they can be moved to a new cluster + the event that you lose too many nodes to be able to form a quorum. +* Detach nodes from an old cluster so they can be moved to a new cluster. .. rubric:: Table of contents @@ -28,38 +28,35 @@ This document shows you how to troubleshoot CrateDB nodes with the Repurpose a node ================ +.. rubric:: About + In a situation where you have irrecoverably lost the majority of the master-eligible nodes in a cluster, you may need to form a new cluster. - When forming a new cluster, you may have to change the `role`_ of one or more nodes. Changing the role of a node is referred to as *repurposing* a node. Each node checks the contents of its :ref:`data path ` -at startup. If CrateDB -discovers unexpected data, it will refuse to start. Specifically: +at startup. 
If CrateDB discovers unexpected data, it will refuse to start. +The specific rules are: - Nodes configured with `node.data`_ set to ``false`` will refuse to start if - they find any shard data at startup + they find any shard data at startup. - Nodes configured with both `node.master`_ set to ``false`` and `node.data`_ set to ``false`` will refuse to start if they have any index metadata at - startup + startup. The `crate-node`_ :ref:`repurpose command ` -can help you clean up the necessary -node data so that CrateDB can be restarted with a new role. +can help you clean up the necessary node data, so that CrateDB can be restarted +with a new role. - -Procedure ---------- +.. rubric:: Procedure To repurpose a node, first of all, you must stop the node. - Then, update the settings `node.data`_ and `node.master`_ in the ``crate.yml`` :ref:`configuration file ` as needed. - The ``node.data`` and ``node.master`` settings can be configured in four -different ways, each corresponding to a different type of node: +different ways, each corresponding to a different type of node. +-------------------+------------------------+-----------------------------+ | Role | Configuration | After repurposing | @@ -95,7 +92,7 @@ deleted (i.e., "cleaned up") after repurposing the node to that configuration. Before running the ``repurpose`` command, make sure that any data you want to keep is available on other nodes in the cluster. -Then, run the ``repurpose`` command: +Then, invoke the ``repurpose`` command. .. code-block:: console @@ -112,15 +109,14 @@ Then, run the ``repurpose`` command: Node successfully repurposed to master and no data. As mentioned in the command output, you can pass in ``-v`` to get a more -verbose output, like so: +verbose output. .. code-block:: console sh$ ./bin/crate-node repurpose -v -Finally, start the node again. - -The node has been successfully repurposed. +Finally, start the node again. After that, the node has been successfully +repurposed. .. 
_crate-node-unsafe-bootstrap: @@ -128,17 +124,21 @@ The node has been successfully repurposed. Perform an unsafe cluster bootstrap =================================== +.. rubric:: About + When communication is lost between one or more nodes in a cluster (e.g., during -a *cluster partition*), the situation is assumed to be temporary and safeguards +a `network partition`_), the situation is assumed to be temporary and safeguards exist to prevent the election of a master node unless a `quorum`_ can be established. However, if the situation is permanent (i.e., you have irrecoverably lost a -majority of the nodes in your cluster), you will need to force the election of +majority of the nodes in your cluster), also known as a `split-brain`_ situation, +you will need to force the election of a master. Forcing a master election without quorum is referred to as an *unsafe cluster bootstrap*. -The `crate-node`_ ``unsafe-bootstrap`` command can help you choose a new master +The :ref:`unsafe-bootstrap command ` +can help you choose a new master node and subsequently perform an unsafe cluster bootstrap. .. WARNING:: @@ -160,8 +160,7 @@ node and subsequently perform an unsafe cluster bootstrap. have access to the file system. -Procedure ---------- +.. rubric:: Procedure Before you continue, you must stop all master-eligible nodes in the cluster. @@ -175,12 +174,11 @@ Before you continue, you must stop all master-eligible nodes in the cluster. Once all master-eligible nodes in the cluster have been stopped, you can manually select a new master. -To help you select a new master, the ``unsafe-bootstrap`` command returns -information about the node cluster state as a pair of values in the form -*(term, version)*. - +To help you select a new master node, the ``unsafe-bootstrap`` command +returns information about the node cluster state as a pair of values in the +form *(term, version)*. 
You can gather this information (safely) by issuing the ``unsafe-bootstrap`` -command and answering "no" (``n``) at the confirmation prompt, like so: +command and answering "no" (``n``) at the confirmation prompt. .. code-block:: console @@ -211,8 +209,8 @@ value, select any one of them. that you elect a master node with the freshest state data. This, in turn, minimizes the potential for data loss and inconsistency. -Once you have selected a node to elect to master, run the ``unsafe-bootstrap`` -command on that node and answer yes (``y``) at the confirmation prompt: +Once you have selected a node to elect to master, invoke the ``unsafe-bootstrap`` +command on that node and answer yes (``y``) at the confirmation prompt. .. code-block:: console @@ -226,31 +224,27 @@ command on that node and answer yes (``y``) at the confirmation prompt: Confirm [y/N] y -If the operation was successful, the command will output: +If the operation was successful, the program will acknowledge it. +**Note:** This success message indicates that the operation was completed. +You may still experience data loss and inconsistencies. .. code-block:: console Master node was successfully bootstrapped -.. NOTE:: - - This success message indicates that the operation was completed. You may - still experience data loss and inconsistencies. - -Start the bootstrapped node and verify that it has started a new cluster with +Now, start the bootstrapped node and verify that it has started a new cluster with one node and elected itself as the master. Before you can add the rest of the nodes to the new cluster, you must detach them from the old cluster (see the :ref:`next section `). -When that's done, start the nodes and verify that they join the new cluster. +After that's done, start the nodes and verify that they join the new cluster. .. NOTE:: Once the new cluster is up-and-running and all recoveries are complete, you - are responsible for assessing the cluster for data loss and - inconsistencies. 
+ are advised to assess the database for data loss and inconsistencies. .. _crate-node-detach-cluster: @@ -258,6 +252,8 @@ When that's done, start the nodes and verify that they join the new cluster. Detach a node from its cluster ============================== +.. rubric:: About + To protect nodes from inadvertently rejoining the wrong cluster (e.g., in the event of a network partition), each node binds to the first cluster it joins. @@ -265,7 +261,8 @@ However, if a cluster has permanently failed (see the :ref:`previous section `) you must detach nodes before you can move them to a a new cluster. -The `crate-node`_ ``detach-cluster`` command can help you move a node to a new +The :ref:`detach-cluster command ` +helps you move a node to a new cluster by resetting the cluster it is bound to (i.e., *detaching* it from its existing cluster). @@ -278,8 +275,7 @@ existing cluster). cluster bootstrap `. -Procedure ---------- +.. rubric:: Procedure To detach a node, run: @@ -293,7 +289,7 @@ To detach a node, run: Confirm [y/N] y -You should see this: +A corresponding message confirms success. .. code-block:: console @@ -304,14 +300,16 @@ When the node is started again, it will be able to join a new cluster. .. NOTE:: You may also have to update the :ref:`discovery configuration - ` so that + `, so that nodes are able to find the new cluster. .. _crate-node: https://cratedb.com/docs/crate/reference/en/latest/cli-tools.html#cli-crate-node .. _data path: https://cratedb.com/docs/crate/reference/en/latest/config/environment.html#application-variables +.. _network partition: https://en.wikipedia.org/wiki/Network_partition .. _node.data: https://cratedb.com/docs/crate/reference/en/latest/config/node.html#node-types .. _node.master: https://cratedb.com/docs/crate/reference/en/latest/config/node.html#node-types .. _quorum: https://cratedb.com/docs/crate/reference/en/latest/concepts/clustering.html#master-node-election .. 
_role: https://cratedb.com/docs/crate/reference/en/latest/config/node.html#node-types +.. _split-brain: https://en.wikipedia.org/wiki/Split-brain_(computing) .. _UUID: https://en.wikipedia.org/wiki/Universally_unique_identifier diff --git a/docs/admin/troubleshooting/index.md b/docs/admin/troubleshooting/index.md index 20112428..f0e0f1b1 100644 --- a/docs/admin/troubleshooting/index.md +++ b/docs/admin/troubleshooting/index.md @@ -1,74 +1,96 @@ -(troubleshooting)= +--- +myst_ref_domains: ["std:ref"] +--- +(troubleshooting)= # Troubleshooting -A collection of guides and tools for troubleshooting CrateDB. +A collection of guides, procedures, and utilities for troubleshooting CrateDB +clusters. -You will learn how to apply self-service measurements supporting you when +You will learn how to apply self-service measures supporting you when observing problems with your CrateDB database cluster. If you need help from others, feel free to reach out to our community or commercial [support channels][support] any time. -:::{tip} +:::{rubric} General Recommendations +::: Relevant suggestions will support you to optimally gather and convey information about your database cluster, which helps others to get a -rough orientation about its size, and other technical characteristics. -Before [contacting support][support], you optimally have this information -ready. -::: +good orientation about its technical details like cluster size, or data +volumes. Before [contacting support][support], you optimally have this +information ready. -:::{note} -The guidelines may not exclusively be applicable for users of CrateDB Cloud, -or when running CrateDB on a commercial contract, where additional -communication channels apply. 
- -Independently from support channels and status, significant parts -of the instructions outlined here will help our support engineers to discover -any sort of problem with your CrateDB cluster much faster than asking -individual questions, so we generally recommend to use -this guideline as a checklist in all situations. -::: +We recommend using these guidelines as a checklist in all situations, in order +to help our support engineers discover any sort of problem with your CrateDB +clusters much faster than by asking individual questions. +**Note:** Additional communication channels apply for users of CrateDB Cloud, or +when running CrateDB on a commercial contract. -## Introduction +:::{rubric} System Tables +::: Many details about the status of your cluster is included within CrateDB's [system tables]. The `sys` schema includes synthetic read-only tables which -can be queried to get statistical real-time information about the cluster's -metadata, like information about nodes and shards. The [](#systables) section -illustrates how to use them. - -The sections below include instructions about how to use the [crate-node] -and [jcmd] utility and diagnosis programs, and information about -using the [Java Flight Recorder] for [monitoring Java applications]. +can be queried to get real-time information about the cluster's metadata, +like information about nodes and shards. + +:::{card} {material-outlined}`wysiwyg;1.6em` Troubleshooting with system tables +:link: systables +:link-type: ref +How to use CrateDB's system tables to investigate the database cluster status, +and debug issues. ::: -## Instructions +:::{rubric} Diagnostic Utilities +::: +Instructions about how to use relevant utility and diagnosis programs. -:::{todo} -- 🚧 A concise step-by-step checklist could be provided here, with - optional tool support. -- 🧹 The "table of contents" section below may be removed. 
+:::{card} {material-outlined}`wysiwyg;1.6em` About CFR +:link: cfr +:link-type: ref +The CrateDB Flight Recorder (CFR) collects information from CrateDB's system tables, +and bundles it into an archive file ready to share with our support engineers. ::: +:::{card} {material-outlined}`wysiwyg;1.6em` About `jcmd` +:link: jcmd +:link-type: ref +The jcmd utility is the traditional application to inquire diagnostics information +from software running on the JVM. It also includes the Java Flight Recorder (JFR). +::: -:::{rubric} Table of contents +:::{card} {material-outlined}`wysiwyg;1.6em` About `crate-node` +:link: use-crate-node +:link-type: ref +A utility program to interact with a CrateDB cluster for conducting +infrastructure operations. For example: +- Control the master node election process. +- Repurpose nodes: Detach and move between clusters. +- Clean up stale node data. ::: -:::{toctree} -:maxdepth: 2 +:::{note} +You can find a lot of troubleshooting guides that explain how to perform +diagnostics on Java applications. -using system tables -using crate-node -using jcmd on Docker +Because CrateDB is written in Java, any of those tools can be used to troubleshoot +CrateDB instances. Above, we are focusing on canonical and traditional utilities, +for example jcmd. 
::: +:::{toctree} +:hidden: +System Tables +cfr +jcmd/jfr +The jcmd Utility +crate-node +::: -[crate-node]: inv:crate-reference#cli-crate-node -[Java Flight Recorder]: https://en.wikipedia.org/wiki/JDK_Flight_Recorder -[jcmd]: https://www.baeldung.com/running-jvm-diagnose -[monitoring Java applications]: https://www.baeldung.com/java-flight-recorder-monitoring [support]: https://cratedb.com/support [system tables]: inv:crate-reference#system-information diff --git a/docs/admin/troubleshooting/docker-jcmd.rst b/docs/admin/troubleshooting/jcmd/docker.rst similarity index 61% rename from docs/admin/troubleshooting/docker-jcmd.rst rename to docs/admin/troubleshooting/jcmd/docker.rst index d76904e5..7bcf4db5 100644 --- a/docs/admin/troubleshooting/docker-jcmd.rst +++ b/docs/admin/troubleshooting/jcmd/docker.rst @@ -1,75 +1,63 @@ .. _jcmd-docker: -=============================================== -Troubleshooting CrateDB with ``jcmd`` on Docker -=============================================== +===================================== +Using ``jcmd`` with CrateDB on Docker +===================================== -You can find a lot of troubleshooting guides for Java applications out there on -the internet that explain how to perform a heap dump, thread dump, and so on. -Since CrateDB is written in Java, these tools can of course also be used to -troubleshoot CrateDB instances in case something goes awry. - -Most of these guides, however, explain how to use tools (such as ``jcmd``) on -Java applications running directly as process on the operating system. Fewer of +Most of the guides about Java diagnostic utilities explain how to use them on +applications running directly as a process on the operating system. Fewer of them cover how to apply ``jcmd`` commands inside a Docker container. -When it comes to troubleshooting the ``crate`` Docker container, things work a -bit differently. This document explains why the 'usual' way to run ``jcmd`` -does not work and how to solve it. 
It does not, however, explain how to analyze -the output (since that is identical to non-containerized applications)! +.. rubric:: Introduction -.. rubric:: Table of contents +When it comes to troubleshooting a CrateDB instance running inside a container +like Docker, things work a bit differently. +This document explains why the standard way to run ``jcmd`` does not work, and +how to solve it. -.. contents:: - :local: +.. note:: + The document does not explain how to analyze the output, because that is + identical to non-containerized applications. -Introduction -============ -``jcmd`` has been the successor of multiple tools (``jstack``, ``jinfo``, -``jmap``) since JDK 8. It can be used to perform various diagnostic tasks on a -running Java application. +.. rubric:: Table of contents -.. code-block:: console +.. contents:: + :local: + +Run ``jcmd`` inside container +============================= - $ /crate/jdk/bin/jcmd -h - Usage: jcmd - or: jcmd -l - or: jcmd -h +The commands below use ``docker``. In the same spirit, you can also use +``podman``. - command must be a valid jcmd command for the selected jvm. - Use the command "help" to see which commands are available. - If the pid is 0, commands will be sent to all Java processes. - The main class argument will be used to match (either partially - or fully) the class used to start Java. - If no options are given, lists Java processes (same as -l). +.. rubric:: Problem - PerfCounter.print display the counters exposed by this process - -f read and execute commands from the file - -l list JVM processes on the local machine - -? -h --help print this help message +After starting a ``cratedb`` Docker container, +.. 
code-block:: console -Running inside ``crate`` Docker container -========================================= + docker run --rm -it --name=cratedb \ + --publish=4200:4200 --publish=5432:5432 \ + --env=CRATE_HEAP_SIZE=2g crate/crate:nightly \ + -Cdiscovery.type=single-node -After starting a ``crate`` Docker container you can also run ``jcmd`` inside -the container. +you can also run ``jcmd`` inside the container. .. code-block:: console - $ docker exec -ti b768001196c /bin/bash - [root@b768001196ce data]# /crate/jdk/bin/jcmd -l + $ docker exec -ti cratedb /bin/bash + [root@cratedb data]# /crate/jdk/bin/jcmd -l 1 io.crate.bootstrap.CrateDB -Cpath.home=/crate -Cnode.name=debug 106 jdk.jcmd/sun.tools.jcmd.JCmd -l -However, when trying to run any command, the command fails, even though you run -it as ``root`` with full privileges. +However, when trying to run any command, it fails, even though you are invoking +the program as ``root`` with full privileges. .. code-block:: console - [root@b768001196ce data]# /crate/jdk/bin/jcmd 1 VM.version + [root@cratedb data]# /crate/jdk/bin/jcmd 1 VM.version 1: com.sun.tools.attach.AttachNotSupportedException: Unable to open socket file /proc/1/root/tmp/.java_pid1: target process 1 doesn't respond within 10500ms or HotSpot VM not loaded at jdk.attach/sun.tools.attach.VirtualMachineImpl.(VirtualMachineImpl.java:100) @@ -79,11 +67,11 @@ it as ``root`` with full privileges. at jdk.jcmd/sun.tools.jcmd.JCmd.main(JCmd.java:99) The same happens when you try to run it as user ``crate``, which owns the -process: +process. .. 
code-block:: console - [root@b768001196ce data]# su crate -c "/crate/jdk/bin/jcmd 1 VM.version" + [root@cratedb data]# su crate -c "/crate/jdk/bin/jcmd 1 VM.version" 1: com.sun.tools.attach.AttachNotSupportedException: Unable to open socket file /proc/1/root/tmp/.java_pid1: target process 1 doesn't respond within 10500ms or HotSpot VM not loaded at jdk.attach/sun.tools.attach.VirtualMachineImpl.(VirtualMachineImpl.java:100) @@ -92,13 +80,12 @@ process: at jdk.jcmd/sun.tools.jcmd.JCmd.executeCommandForPid(JCmd.java:115) at jdk.jcmd/sun.tools.jcmd.JCmd.main(JCmd.java:99) -On a different note: when looking at the Docker logs of the ``crate`` +On a different note: When looking at the Docker logs of the ``crate`` container, you can see that when trying to run the ``jcmd`` command, the CrateDB instance logs a full thread dump. -What is the problem then? ----------------------------- +.. rubric:: Root Cause The entrypoint_ of the ``crate`` Docker image ensures that the CrateDB Java process runs as user ``crate``, since **CrateDB must be run as a non-root @@ -111,12 +98,15 @@ with PID ``1`` and the crate command would be a child-process with a different PID. This is not what one wants in a Docker container, where the application must (?) run as PID 1. + +.. rubric:: Solution + With that knowledge in mind, you can use ``chroot`` to execute the ``jcmd`` command as well. .. code-block:: console - [root@b768001196ce data]# chroot --userspec=1000 / /crate/jdk/bin/jcmd 1 VM.version + [root@cratedb data]# chroot --userspec=1000 / /crate/jdk/bin/jcmd 1 VM.version 1: OpenJDK 64-Bit Server VM version 13.0.1+9 JDK 13.0.1 @@ -126,7 +116,7 @@ for troubleshooting CrateDB inside the Docker container. .. 
code-block:: console - [root@b768001196ce data]# chroot --userspec=1000 / /crate/jdk/bin/jcmd 1 help + [root@cratedb data]# chroot --userspec=1000 / /crate/jdk/bin/jcmd 1 help 1: The following commands are available: Compiler.CodeHeap_Analytics @@ -181,18 +171,18 @@ for troubleshooting CrateDB inside the Docker container. To execute one of these commands from outside of the Docker container without explicitly attaching to it, you can combine the ``docker exec`` command with the -``jcmd`` command. This would look like so: +``jcmd`` command. .. code-block:: console $ docker exec -ti /bin/bash -c "chroot --userspec=1000 / /crate/jdk/bin/jcmd 1 " For example, running ``GC.heap_info`` on Docker container with ID -``b768001196ce``: +``cratedb``. .. code-block:: console - $ docker exec -ti b768001196ce /bin/bash -c "chroot --userspec=1000 / /crate/jdk/bin/jcmd 1 GC.heap_info" + $ docker exec -ti cratedb /bin/bash -c "chroot --userspec=1000 / /crate/jdk/bin/jcmd 1 GC.heap_info" 1: garbage-first heap total 524288K, used 129716K [0x00000000e0000000, 0x0000000100000000) region size 1024K, 126 young (129024K), 22 survivors (22528K) @@ -203,35 +193,20 @@ For example, running ``GC.heap_info`` on Docker container with ID Troubleshooting Commands ======================== - -Thread Dump ------------ - -:Command: ``jcmd Thread.print`` - - -Example -....... - -.. code-block:: console - - $ docker exec -ti b768001196ce /bin/bash -c "chroot --userspec=1000 / /crate/jdk/bin/jcmd 1 Thread.print" - 1: - ... - +These are the most common troubleshooting tasks, but of course there are many +more possibilities to get diagnostic information using the ``jcmd`` command. +You can find more information about the utility at the `jcmd documentation`_. Heap Info --------- :Command: ``jcmd GC.heap_info`` - -Example -....... +.. rubric:: Example .. 
code-block:: console - $ docker exec -ti b768001196ce /bin/bash -c "chroot --userspec=1000 / /crate/jdk/bin/jcmd 1 GC.heap_info" + $ docker exec -ti cratedb /bin/bash -c "chroot --userspec=1000 / /crate/jdk/bin/jcmd 1 GC.heap_info" 1: ... @@ -241,13 +216,11 @@ Heap Dump :Command: ``jcmd GC.heap_dump `` - -Example -....... +.. rubric:: Example .. code-block:: console - $ docker exec -ti b768001196ce /bin/bash -c "chroot --userspec=1000 / /crate/jdk/bin/jcmd 1 GC.heap_dump /data/crate.hprof" + $ docker exec -ti cratedb /bin/bash -c "chroot --userspec=1000 / /crate/jdk/bin/jcmd 1 GC.heap_dump /data/crate.hprof" 1: Heap dump file created @@ -258,18 +231,32 @@ Example is not "blown up". -Java Flight Recording ---------------------- +Thread Dump +----------- -:Command: ``jcmd JFR.start name= duration= filename= settings=profile`` +:Command: ``jcmd Thread.print`` + +.. rubric:: Example + +.. code-block:: console + + $ docker exec -ti cratedb /bin/bash -c "chroot --userspec=1000 / /crate/jdk/bin/jcmd 1 Thread.print" + 1: + ... + + +.. _jfr-docker: +Java Flight Recorder (JFR) +-------------------------- -Example -....... +:Command: ``jcmd JFR.start name= duration= filename= settings=profile`` + +.. rubric:: Example .. code-block:: console - $ docker exec -ti b768001196ce /bin/bash -c "chroot --userspec=1000 / /crate/jdk/bin/jcmd 1 JFR.start name=recording1 duration=60s filename=/data/recording1.jfr" + $ docker exec -ti cratedb /bin/bash -c "chroot --userspec=1000 / /crate/jdk/bin/jcmd 1 JFR.start name=recording1 duration=60s filename=/data/recording1.jfr" 1: Started recording 1. The result will be written to: @@ -281,10 +268,6 @@ Example access the created jfr dump from ouside of the container and the container is not "blown up". -These are the most common troubleshooting tasks, but of course there are many -more possibilities to get diagnostic information using the ``jcmd`` command. -You can find more information about the utility at the `jcmd documentation`_. - .. 
_entrypoint: https://github.com/crate/docker-crate/blob/master/docker-entrypoint.sh -.. _jcmd documentation: https://docs.oracle.com/javase/8/docs/technotes/guides/troubleshoot/tooldescr006.html#BABEHABG +.. _jcmd documentation: https://docs.oracle.com/en/java/javase/17/docs/specs/man/jcmd.html diff --git a/docs/admin/troubleshooting/jcmd/index.md b/docs/admin/troubleshooting/jcmd/index.md new file mode 100644 index 00000000..975f2871 --- /dev/null +++ b/docs/admin/troubleshooting/jcmd/index.md @@ -0,0 +1,134 @@ +(jcmd)= + +# Using `jcmd` with CrateDB + +:::{rubric} Introduction +::: +Since JDK 8, `jcmd` is the designated successor of different tools used before +(`jstack`, `jinfo`, `jmap`). It can be used to perform various diagnostic tasks +on a running Java application, such as performing a heap dump, thread dump, and +so on. + +> The [jcmd] utility is used to send diagnostic command requests to the JVM, +where these requests are useful for controlling Java Flight Recordings, +troubleshoot, and diagnose JVM and Java Applications. It must be used on +the same machine where the JVM is running, and have the same effective user +and group identifiers that were used to launch the JVM. + +:::{code} console +$ /crate/jdk/bin/jcmd -h +Usage: jcmd + or: jcmd -l + or: jcmd -h + + command must be a valid jcmd command for the selected jvm. + Use the command "help" to see which commands are available. + If the pid is 0, commands will be sent to all Java processes. + The main class argument will be used to match (either partially + or fully) the class used to start Java. + If no options are given, lists Java processes (same as -l). + + PerfCounter.print display the counters exposed by this process + -f read and execute commands from the file + -l list JVM processes on the local machine + -? -h --help print this help message +::: + + +:::{rubric} Help +::: +`jcmd help` lists all available commands that you can use to troubleshoot +CrateDB. 
+:::{code} console +/crate/jdk/bin/jcmd 1 help +::: +For more information about a specific command, use `help <command>`. + + +:::::{grid} 1 1 2 2 +:padding: 0 + +::::{grid-item} +:class: rubric-slimmer + +:::{rubric} Troubleshooting Commands +::: +:::{code} console +# Java Version +/crate/jdk/bin/jcmd 1 VM.version + 1: + OpenJDK 64-Bit Server VM version 13.0.1+9 + JDK 13.0.1 + +# Heap Information +/crate/jdk/bin/jcmd 1 GC.heap_info + +# Heap Dump +/crate/jdk/bin/jcmd 1 GC.heap_dump /data/crate.hprof + +# Thread Dump +/crate/jdk/bin/jcmd 1 Thread.print +::: +:::: + +::::{grid-item} +:class: rubric-slimmer + +:::{rubric} Java Flight Recorder (JFR) +::: +:::{code} console +/crate/jdk/bin/jcmd 1 JFR.start duration=60s filename=/data/recording1.jfr +/crate/jdk/bin/jcmd 1 JFR.start duration=300s filename=/data/recording2.jfr settings=profile +::: +:::{card} {material-outlined}`receipt_long;2em` Java Flight Recorder (JFR) +:link: jfr +:link-type: ref +A monitoring tool that collects information about the events in a Java Virtual +Machine (JVM) during the execution of a Java application. JFR is part of the +JDK distribution, and it is integrated into the JVM. +::: +:::: + +::::: + + + +## Learn + +:::{rubric} Fundamentals +::: + +:::{card} {material-outlined}`receipt_long;1.6em` Diagnosing a Running JVM +:link: https://www.baeldung.com/running-jvm-diagnose#7-jfr-command-options +This tutorial examines the jcmd utility, its commands, command options, and +how to use it. +::: + +:::{card} {material-outlined}`library_books;1.6em` Java SE Diagnostic Tools » The jcmd Utility +:link: https://docs.oracle.com/en/java/javase/11/troubleshoot/diagnostic-tools.html#GUID-42A18B29-B4AD-4831-B846-2CDBA55F2254 +The official documentation about jcmd outlines its advantages, useful commands, and +troubleshooting guidelines. 
+::: + +:::{rubric} CrateDB +::: + +:::{card} {octicon}`container;1.6em` Use jcmd with CrateDB on Docker +:link: jcmd-docker +:link-type: ref +Learn why the standard way to run `jcmd` does not work when running CrateDB +inside a container, for example when using Docker, and how to resolve that +problem. +::: + + + +:::{toctree} +:hidden: + +docker +jfr +::: + + +[jcmd]: https://docs.oracle.com/en/java/javase/17/docs/specs/man/jcmd.html diff --git a/docs/admin/troubleshooting/jcmd/jfr.md b/docs/admin/troubleshooting/jcmd/jfr.md new file mode 100644 index 00000000..fd8ba2ee --- /dev/null +++ b/docs/admin/troubleshooting/jcmd/jfr.md @@ -0,0 +1,89 @@ +(jfr)= +# Java Flight Recorder (JFR) + +:::{rubric} About +::: +The jcmd utility is the traditional application to inquire diagnostic information +from applications running on the JVM, also including the [Java Flight Recorder] (JFR). + +:::{rubric} Details +::: +The Java Flight Recorder (JFR) is a profiling and event collection framework to +gather detailed low-level information about how the Java Virtual Machine (JVM) +and Java applications are behaving during execution. + +Flight recordings can be started when the application is started or while the +application is running. The data is recorded as time-stamped data points called +events. + +JFR is part of the JDK distribution, and it is integrated into the JVM. + +:Events: + + JFR collects events that occur in the JVM when the Java application runs. The + events are related to the state of the JVM itself or the state of the program. + An event has a name, a timestamp, and additional information, like thread + information, execution stack, and state of the heap. + +:Recording Types: + + A _time fixed recording_, also known as a _profiling recording_, runs for a set + amount of time, and then stops. 
A _continuous recording_ is a recording that is + always on and saves, for example, the last six hours of data into a circular + buffer, discarding old data when the buffer fills up. + +:Performance: + + The events that JFR collects contain a huge amount of data. For this reason, + JFR is designed to affect the performance of a running application as little + as possible. + + JFR saves data about the events into a single output file. + Because disk I/O operations are expensive, JFR uses various buffers to store + collected data before flushing blocks of data to disk. + + +## Synopsis +:::{code} console +jcmd 1 JFR.start duration=60s filename=/data/recording1.jfr +jcmd 1 JFR.start duration=300s filename=/data/recording2.jfr settings=profile +::: + + +## Learn + +:::{rubric} Fundamentals +::: + +:::{card} {material-outlined}`article;1.6em` Java Flight Recorder » Basic Concepts and Usage +:link: https://www.baeldung.com/java-flight-recorder-monitoring#java-flight-recorder +This tutorial examines Java Flight Recorder, its concepts, its basic commands, and how to use it. +::: + +:::{card} {material-outlined}`receipt_long;1.6em` Diagnose a Running JVM » JFR Command Options +:link: https://www.baeldung.com/running-jvm-diagnose#7-jfr-command-options +A concise example of how to generate a JFR file using the jcmd command. +::: + +:::{card} {material-outlined}`library_books;1.6em` Java SE Diagnostic Tools » Flight Recorder +:link: https://docs.oracle.com/en/java/javase/11/troubleshoot/diagnostic-tools.html#GUID-D38849B6-61C7-4ED6-A395-EA4BC32A9FD6 +The official documentation about JFR outlines its advantages, its event types grouped +by recording templates, and its types of recordings. It also describes in detail how +to produce flight recordings, what's inside, and how to analyze them.
+::: + +:::{rubric} CrateDB +::: + +:::{card} {octicon}`container;1.6em` Use JFR with CrateDB on Docker +:link: jfr-docker +:link-type: ref +Learn why the standard way to run `jcmd` does not work when running CrateDB +inside a container, for example when using Docker, and how to resolve that +problem. +::: + + + +[Java Flight Recorder]: https://en.wikipedia.org/wiki/JDK_Flight_Recorder +[jcmd]: https://docs.oracle.com/en/java/javase/17/docs/specs/man/jcmd.html diff --git a/docs/admin/troubleshooting/system-tables.rst b/docs/admin/troubleshooting/system-tables.rst index 7ef7ff2a..fa7b0c69 100644 --- a/docs/admin/troubleshooting/system-tables.rst +++ b/docs/admin/troubleshooting/system-tables.rst @@ -1,14 +1,15 @@ .. highlight:: psql .. _systables: -================================== -Troubleshooting with system tables -================================== +============================== +Diagnostics with System Tables +============================== CrateDB maintains a set of diagnostic tables in the **sys** schema. It currently consists of ten tables that provide an overview of the cluster state. -If something is going wrong and you initially don't know why, they help you to -analyze, identify the problem and start mitigating it. While there is + +If something is going wrong, and you initially don't know why, they help you to +analyze, identify the problem, and start mitigating it. While there is :ref:`detailed information about all system tables `, this guide runs you through the most common situations. @@ -18,8 +19,8 @@ this guide runs you through the most common situations. :local: -Step 1: Health check -==================== +Step 1: Inspect health checks +============================= A good point to start is the table **sys.check** that maintains a number of health checks. You may know it from the admin UI. Order them by severity:: @@ -32,20 +33,20 @@ health checks. You may know it from the admin UI. Order them by severity:: SELECT ... in set (... 
sec) If a check fails, the description offers some explanation on how to proceed. -The table reports checks that verify your cluster layout, give recommendations -for configuration options, and warn you on incompatible software versions. More -will be added as you go. +This synthetic table reports checks that verify your cluster layout, gives recommendations +for configuration options, and warns you about incompatible software versions. More +checks will be added as we go. -Step 2: Activity in the cluster -=============================== +Step 2: Check cluster activity +============================== Statements that are currently executed on the server are tracked in the tables **sys.jobs** and **sys.operations**. They give you the opportunity to view the ongoing activity in the cluster. If you're using an earlier version than CrateDB 3.0.0, you will have to enable -statistics using: +statistics using:: cr> SET GLOBAL stats.enabled = true; SET OK, 1 row affected ( … sec) @@ -81,10 +82,10 @@ recorded history of finished jobs and operations in the tables **sys.jobs_log** and **sys.operations_log**, respectively. -Step 3: Analyzing cluster resources -=================================== +Step 3: Analyze cluster resources +================================= -Sometimes it's not a single query that causes problems, but a component of your +Sometimes it is not a single query that causes problems, but a component of your distributed cluster. To find out more about it, check the table **sys.cluster**, which holds a single row containing the name and ID of the current master along with several other settings. To list all available data, @@ -111,7 +112,7 @@ overloaded or because they have an outdated Java version:: +-------+--------...+------------------------...+ SELECT ... in set (...
sec) -To list all nodes using more than 98 per cent of the memory, type:: +To list all nodes using more than 98 per cent of system memory, invoke:: cr> SELECT * FROM sys.nodes WHERE mem['used_percent'] > 98; +--...+---...+------...-+-...+---...+--...+---...+------...+-...+------...+---...+-----...-+-------...+----------...-+------...+ @@ -120,8 +121,11 @@ To list all nodes using more than 98 per cent of the memory, type:: ... SELECT ... in set (... sec) -The table also contains the performance metrics like the load average, disk, -memory, heap, or network throughput. Running:: +The table also contains performance metrics like the load average, disk, +memory, heap, or network throughput. +The object has the same structure as the **_node** system column of +**sys.operations** from the previous section. +This query lists all available attributes:: cr> SHOW columns IN nodes FROM sys; +-------------------------------------------------...+-----------...+ @@ -131,8 +135,6 @@ memory, heap, or network throughput. Running:: +-------------------------------------------------...+-----------...+ SHOW ... rows in set (... sec) -lists all available attributes. This object has the same structure as the -**_node** system column of **sys.operations** from the previous section. Step 4: Insights about partitions, shards, and replication @@ -140,11 +142,13 @@ Step 4: Insights about partitions, shards, and replication CrateDB divides the rows of each table into shards that are distinctively distributed to all nodes in your cluster. Replication uses the same mechanism -to add redundancy and thus resilience to your data. While most of the time +to add redundancy and thus resilience to your data. + +While most of the time CrateDB transparently takes care of distributing and replicating the shards, -it's useful during troubleshooting to actually find out some more about these +it is useful in troubleshooting situations to learn more about these data structures. 
The **sys.shards** table provides access to the status and -size of shards, their names and IDs:: +size of shards, their names, and IDs:: cr> SHOW COLUMNS IN shards FROM sys; +--------------------------------+-----------+ @@ -191,9 +195,10 @@ on the admin UI evaluate these values as well. The **sys.shards** table contains even more information about the rebalancing activities. Sometimes CrateDB needs to transfer a shard to another node, since that may be necessary to ensure there are enough replicas of it distributed in -the cluster. You can estimate the progress of that operation with the -**recovery** object. To monitor the progress of the shard transfer, run this -query:: +the cluster. + +You can estimate the progress of that operation with the **recovery** object. +Run this query to monitor the progress of the shard transfer:: cr> select _node['hostname'], id, recovery['stage'], recovery['size']['percent'], routing_state, state from sys.shards ... where routing_state in ('RELOCATING','INITIALIZING') order by id; @@ -215,7 +220,9 @@ until the transfer is done. After that, the source row is deleted from **sys.shards** automatically. To find out on which specific node a shard is stored, also use the object in -the **_node** system column that is available for this table. As an example:: +the **_node** system column that is available for this table. For example, +this query lists the hosts and tables with the highest number of rows inside +a single shard:: cr> SELECT _node['hostname'], table_name, num_docs FROM sys.shards ORDER BY num_docs DESC LIMIT 3; +-------------------...+-----------...-+----------+ @@ -225,9 +232,6 @@ the **_node** system column that is available for this table. As an example:: +-------------------...+------------...+----------+ SELECT ... in set (... sec) -This query lists the hosts and tables with the highest number of rows inside a -single shard. - .. SEEALSO:: :ref:`Bulk import: Shards and replicas ` @@ -245,17 +249,17 @@ apparent reason. 
You would probably want to find out what is causing the cluster to not allocate the shards. For that, there is the ``sys.allocations`` table, which lists all shards in the cluster. -If a shard is unassigned, the row will also include a reason why it cannot be -allocated on any node. +- If a shard is unassigned, the row will also include a reason why it cannot be + allocated on any node. -If a shard is assigned but cannot be moved or rebalanced, the row includes a -reason why it remains on the current node. +- If a shard is assigned but cannot be moved or rebalanced, the row includes a + reason why it remains on the current node. -For a full list of available columns, see the :ref:`reference documentation -about the sys.allocations table `. +- For a full list of available columns, see the :ref:`reference documentation + about the sys.allocations table `. -To find out about the different states of shards of a specific table, you can -simply filter by ``table_schema`` and ``table_name``, e.g.:: +- To find out about the different states of shards of a specific table, you can + simply filter by ``table_schema`` and ``table_name``, e.g.:: cr> SELECT table_name, shard_id, node_id, explanations ... FROM sys.allocations @@ -271,13 +275,15 @@ simply filter by ``table_schema`` and ``table_name``, e.g.:: SELECT ... in set (... sec) -Step 6: Managing snapshots -========================== +Step 6: Manage snapshots +======================== Finally: if your repair efforts did not succeed, and your application or users accidentally deleted some data, recover one of the previously taken snapshots of your cluster. The tables **sys.snapshots** and **sys.repositories** assist -you in managing your backups. Remember, one or more backups are stored in +you in managing your backups. + +Remember, one or more backups are stored in repositories outside the CrateDB cluster initialized with the **CREATE REPOSITORY** request. 
An actual copy of a current database state is made with the **CREATE SNAPSHOT** command. If you forgot where you store your snapshots:: @@ -290,7 +296,8 @@ the **CREATE SNAPSHOT** command. If you forgot where you store your snapshots:: SELECT ... in set (... sec) might come in handy. To actually recover data, first determine which snapshot -to restore. Suppose you make nightly backups, the command:: +to restore. Suppose you make nightly backups, this command displays last week's +snapshots along with their name, the stored indices, and how long they took:: cr> SELECT * FROM sys.snapshots ORDER BY started DESC LIMIT 7; +------------------+----------+------+------------+---------+-------+---------+ @@ -298,6 +305,3 @@ to restore. Suppose you make nightly backups, the command:: +------------------+----------+------+------------+---------+-------+---------+ +------------------+----------+------+------------+---------+-------+---------+ SELECT ... in set (... sec) - -shows you last week's snapshots along with their name, the stored indices, and -how long they took. diff --git a/docs/conf.py b/docs/conf.py index 00233151..bf2e2a0a 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -27,10 +27,15 @@ r"https://stackoverflow.com/.*", # Expired certificate. r"https://tldp.org/LDP/Linux-Filesystem-Hierarchy/html/index.html", + # 403 Client Error: Forbidden for url + r"https://www.baeldung.com/.*", + # 404 Client Error: Not Found + r"https://github.com/crate-workbench/cratedb-toolkit/actions/runs/.*", ] # Configure intersphinx. intersphinx_mapping.update({ + 'ctk': ('https://cratedb-toolkit.readthedocs.io/', None), 'matplotlib': ('https://matplotlib.org/stable/', None), 'pandas': ('https://pandas.pydata.org/pandas-docs/stable/', None), 'numpy': ('https://numpy.org/doc/stable/', None),