From 087c9d076aeefe49eef632cb46e7e8fecbe238f4 Mon Sep 17 00:00:00 2001 From: Michael Herstine Date: Sat, 19 Feb 2022 15:55:21 -0800 Subject: [PATCH] Commit an RFC for this plugin. --- .../VisualizationOutputPlugin.cxx | 250 ++++++++++++++++++ .../VisualizationOutputPlugin.hxx | 136 ++++++++++ 2 files changed, 386 insertions(+) create mode 100644 src/output/plugins/visualization/VisualizationOutputPlugin.cxx create mode 100644 src/output/plugins/visualization/VisualizationOutputPlugin.hxx diff --git a/src/output/plugins/visualization/VisualizationOutputPlugin.cxx b/src/output/plugins/visualization/VisualizationOutputPlugin.cxx new file mode 100644 index 0000000000..028af58517 --- /dev/null +++ b/src/output/plugins/visualization/VisualizationOutputPlugin.cxx @@ -0,0 +1,250 @@ +/* + * Copyright 2003-2022 The Music Player Daemon Project + * http://www.musicpd.org + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +/** + * \page vis_out_protocol Visualization Network Protocol + * + * See \ref vis_out "RFC: Visualization Output Plugin" for background. 
+ * + * + * \section vis_out_protocol_timing Timing + * + * In order to deliver sound data to the client at the proper time, the protocol + * needs to take into account: + * + * - network latency: the delta between writing the sound data to the socket & its + * receipt on the client + * + * - player buffering: the player may buffer sound data (mplayer, for instance, + * buffers half a second's worth of audio before beginning playback) + * + * - render time: the client presumably wishes the current frame to appear on-screen + * at the moment the current sound information is ending + * + * Throughout, let \e t be "song time" as measured on the server, and T(t) be + * sound information for song time \e t. Let FPS be the frames-per-second at + * which the client would like to render. + * + * Then, at an interval of 1/FPS seconds, the server needs to write + * + \verbatim + T(t - {buffer time} + {render time} + {one way latency}) + \endverbatim + * + * to the client socket. If we denote that time value by tau, then the server + * should wait max(0, -tau) ms to write the first frame. + * + * A few examples will illustrate. + * + * \subsection vis_out_protocol_timing_eg_1 Example 1 + * + * Let the client render time be 4ms and round-trip network latency be + * 6ms. Assume no player buffering. In order to render a frame corresponding to + * song time \e t, the client would need, at time \e t - 4 ms, sound information + * corresponding to time \e t, or T(t). The server would need to \e send that + * information at time \e t - 7ms (half of one round-trip plus render time). + * + * In other words, on the server side at song time \e t, we would need to write + * T(t + 7ms) to the client socket. If the server writes T(t+7ms) immediately, + * the client will receive it at \e t + 3ms, take 4ms to render the next frame, + * and so at \e t + 7ms hence, finish rendering T(t+7ms). 
+ * + * \subsection vis_out_protocol_timing_eg_2 Example 2 + * + * Imagine we are running the same client on a laptop, connected to an MPD + * server over the internet, and using mplayer as the player. This gives 500ms + * of buffer time. Let us assume the same 4ms render time, but now a 20ms + * round-trip time. + * + * In order to render a frame corresponding to song time \e t, the client would + * need, at time \e t - 4ms, T(t). This would need to be sent from the server at + * time \e t - 14ms. We now need to incorporate the client-side buffering, + * however. Song time \e t will be actually played on the client at \e t + 500ms + * on the server. + * + * In other words, on the server side at song time \e t, we would need to write + * T(t-486ms) to the client socket. + * + * Since the sound won't start on the client for 0.5 sec, it would make no sense + * to begin writing sound information for 486ms. Let t(0) be the moment the + * client connects and the player begins buffering. If, at t(0) + 486ms, the + * server writes T(t(0)), the client will receive it at t(0) + 496ms & complete + * rendering it at t(0) + 500ms, which is when the client-side player will + * begin playing song time t(0). + * + * \section vis_out_protocol_proto The Protocol + * + * \subsection vis_out_protocol_proto_design Design + * + * The author is unaware of any existing network protocols in this area, so he + * designed his own after reviewing the Shoutcast & Ultravox + * protocols. Experience with the TLS & 802.11 protocols also informed this + * design. + * + * Design goals include: + * + * - client convenience + * - this in particular drove the choice to stream updates; everything + * needed to simply push the data out is knowable at handshake time, + * so why force the client to send a request? 
+ * - efficiency on the wire + * - binary format + * - streaming preferred over request/response + * - future extensibility + * - protocol versioning built-in from the start + * - parsing convenience + * - streaming messages come with a few "magic bytes" at the start + * to assist clients in "locking on" to the stream & recovering from + * corrupted data + * - all messages conform to the "type-length-value" (TLV) format + * beloved of parser writers + * + * + * \subsection vis_out_protocol_proto_overview Overview + * + * The protocol is a combination of request/response as well as streaming. After + * an initial handshake (client goes first) the server will begin streaming + * messages to the client; i.e. at the interval the client specified during the + * initial handshake the server will send FRAME messages containing sound + * information useful for visualizers. Additionally, METADATA messages will be + * sent on receipt of notifications from MPD that the song has changed. The + * client need not request these messages nor does the client need to acknowledge + * them in any way. + * + * The client \e may, at any time after handshake completion, initiate two other + * exchanges: + * + * - PING: the client may want to periodically adjust its estimate of the + * round-trip client-side latency; the server will respond with a PONG. + * The client can measure the delta between request & response & update + * its estimate. + * + * - ADJBUF: the client may periodically want to adjust the "buffer time"; + * that is, the delta between song time as measured on the server and + * the song time to which each FRAME & METADATA message corresponds; the server + * will adjust its internal timers & respond with an ADJBUFACK message. + * The server \e may send one more frame at the old setting of tau. 
+ * + * Schematically, a conversation looks like this: + * + \verbatim + Client Server + + desired protocol version + tau (buffer offset) + desired sound params --------- CLIHLO ---------> + damping parameter + window parameter + ... + + <-------- SRVHLO --------- offered protocol version + | + sequence number --------- PING ----------> | + <-------- PONG ---------- sequence number | tau ms + | + | + <------- METADATA-------- artist, title &c v + <-------- FRAME --------- samples, spectrum | + bass/mids/trebs | + ... | tau ms + | + <------- METADATA-------- artist, title &c v + <-------- FRAME --------- samples, spectrum | + bass/mids/trebs | + ... | + sequence number --------- PING ----------> | + <-------- PONG ---------- sequence number | tau ms + | + | + <------- METADATA-------- artist, title &c v + <-------- FRAME --------- samples, spectrum | + bass/mids/trebs | + ... | tau ms + | + <------- METADATA-------- artist, title &c v + <-------- FRAME --------- samples, spectrum | + bass/mids/trebs | + ... | tau ms + | + tau' (new buffer -------- ADJBUF --------> | + offset) <------ ADJBUFACK ------- | + | + <------- METADATA-------- artist, title &c v + <-------- FRAME --------- samples, spectrum | + bass/mids/trebs | + ... | tau' ms + | + <------- METADATA-------- artist, title &c v + <-------- FRAME --------- samples, spectrum + bass/mids/trebs + ... + .... + (forever) + \endverbatim + * + * There is no formal "close" or "teardown" message; each side simply detects + * when the other has gone away & treats that as the end of the conversation. + * + * + * \subsection vis_out_protocol_proto_msgs Messages + * + * All messages: + * + * - integers use network byte order (i.e. 
big endian) + * - use TLV format (streaming messages prepend magic bytes) + * + \verbatim + + +-----------------------+-----------------+-----------------------+--------+ + | TYPE (16-bit unsigned)| LENGTH | PAYLOAD | CHECK | + | class | message type | 16-bits unsigned| LENGTH bytes | 1 byte | + |-------+---------------|-----------------|-----------------------+--------+ + | 4 bits| 12 bits | (max len 65535) | format is msg-specific| 00 | + +-----------------------+-----------------+-----------------------+--------+ + + \endverbatim + * + * Notes: + * + * - the message type is comprised of two values packed into a u16_t: + * + * - class: (type & f000) >> 12: + * - 0: handshake + * - 1: control (PING, e.g.) + * - 2: streaming (FRAME, e.g.) + * + * - message type: (type & 0fff) see below for values + * + * - the "length" field is the length of the \e payload \e only + * + * - the "check" byte is intended as a sanity test & shall always be zero + * TODO(sp1ff): replace this with a proper checksum? + * + * TODO(sp1ff): define each message + * + * + */ + + +const struct AudioOutputPlugin visualization_output_plugin = { + "visualization", + nullptr, // cannot serve as the default output + nullptr, // TODO(sp1ff): Write me! + nullptr, // no particular mixer +}; diff --git a/src/output/plugins/visualization/VisualizationOutputPlugin.hxx b/src/output/plugins/visualization/VisualizationOutputPlugin.hxx new file mode 100644 index 0000000000..832d549e45 --- /dev/null +++ b/src/output/plugins/visualization/VisualizationOutputPlugin.hxx @@ -0,0 +1,136 @@ +/* + * Copyright 2003-2022 The Music Player Daemon Project + * http://www.musicpd.org + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#ifndef MPD_VISUALIZATION_OUTPUT_PLUGIN_HXX +#define MPD_VISUALIZATION_OUTPUT_PLUGIN_HXX + +/** + * \page vis_out RFC: Visualization Output Plugin + * + * \section vis_out_intro Introduction + * + * This plugin started from a conversation on the #mpd IRC channel. I asked + * about the best way to implement a music visualizer as a remote MPD client. All the current MPD visualizers of + * which I'm aware use the fifo output + * plugin and hence must be run on the same host as the MPD daemon. + * + * The response I got was a suggestion that I write an output plugin that would + * \e just stream the data needed to implement a remote visualizer. I've begun + * work on such a plugin, but before I spend too much time implementing it I + * would like to lay out my proposal & solicit feedback. + * + * The codebase uses PR + * proposed solving this problem by implementing an output plugin that would + * stream the raw PCM data over TCP, the idea being that the remote visualizer + * would do the sound analysis client-side. The PR was discarded as being + * redundant with the httpd + * output plugin. I would also observe that such a solution sends far more + * data on the wire than is needed for visualization (on which more below). + * + * - ncmpcpp uses the fifo + * output plugin, and as such can only provide the visualization feature when + * it's being run locally. 
The sound analysis is limited, as well (see below) + * + * - cli-visualizer will + * work with the MPD fifo (again assuming the MPD daemon is running + * locally). Limited sound analysis, as well. + * + * - MilkDrop: + * reading the source code was very instructive in terms of sound analysis for + * music visualization; that aspect of this proposal is largely based on it. + * + * + * \section vis_out_proposal The Proposal + * + * A new output plugin "visualization" will be implemented. The plugin will + * cache recent PCM data. The plugin will also be a ServerSocket. When clients + * connect, they will provide the details of the sound analysis they would like + * performed, the frame rate at which they would like to receive updates and the + * offset between client-side render time & server-side song time (to account + * for network lag, client-side buffering & the time needed to render each + * frame). Once that initial handshake is complete, the server will stream + * updates containing sound analysis results at regular intervals to the + * client. + * + * \subsection vis_output_proposal_analysis Sound Analysis + * + * Given audio data in raw PCM format, a number of steps may be taken to + * analyze that data & produce information useful to visualizers: + * + * - the PCM data may optionally be damped by taking a weighted average between + * the current values & prior values in the time domain; this will have the + * effect of reducing noise in the higher frequency ranges + * + * - the PCM data may have a "window function" applied to it in the time domain + * around the time of interest; such a function has the effect of "dialing + * down" audio samples further from the timestamp of interest and again will + * reduce higher-frequency noise; the size of the window may be configured to + * incorporate more or less data as desired. 
+ * + * - the resulting PCM data will be shifted into the frequency domain by + * application of the Discrete Fourier Transform + * + * - the human ear can only distinguish frequencies from (about) 200Hz to 20000Hz, + * and in practice musical sound information doesn't show much activity above + * 10000Hz; it is therefore convenient to throw out frequency data outside + * some (client-configurable) range + * + * - it is also convenient to divide the resulting spectrum into a few coarse + * bands, such as bass/mids/trebs. This is computationally non-trivial because + * perceptually, frequency is not linear, it's logarithmic. A change of one + * octave corresponds to a doubling in frequency. Intuitively, this means that + * the difference between 200 & 300Hz is much greater than the difference + * between 5000 & 5100Hz, e.g. The plugin will perform this service for + * each client. + * + * - it can also be useful to maintain a weighted time average of the activity + * in each frequency range for purposes of beat detection + * + * + * \subsection vis_output_protocol The Protocol + * + * Note that each update need only be based on relatively few samples (Winamp, + * e.g. uses 576). This will keep the data transferred on the wire small (at + * least by comparison to, say, the httpd output plugin which of course needs to + * send the entire song). Casting the protocol in terms of client-side FPS + * allows us to avoid a "request/response" protocol & simply stream until the + * client goes away. + * + * I've broken out the detailed protocol specification into its own + * \ref vis_out_protocol "page". + * + * + */ + +extern const struct AudioOutputPlugin visualization_output_plugin; + +#endif