diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 241285bc8c..df7f342fe1 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -88,7 +88,8 @@ jobs:
           libupnp-dev \
           libsqlite3-dev \
           libchromaprint-dev \
-          libgcrypt20-dev
+          libgcrypt20-dev \
+          libfftw3-dev
 
     - name: Full Build
       uses: BSFishy/meson-build@v1.0.3
@@ -152,7 +153,8 @@ jobs:
           libvorbis \
           faad2 \
           wavpack \
-          libmpdclient
+          libmpdclient \
+          fftw
 
     - name: Build
       uses: BSFishy/meson-build@v1.0.3
@@ -193,6 +195,7 @@ jobs:
           dbus:p
           faad2:p
           ffmpeg:p
+          fftw:p
           fmt:p
           flac:p
           gtest:p
diff --git a/meson_options.txt b/meson_options.txt
index f887cb170a..5911d6f5af 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -186,6 +186,8 @@ option('shout', type: 'feature', description: 'Shoutcast streaming support using
 option('snapcast', type: 'boolean', value: true, description: 'Snapcast output plugin')
 option('sndio', type: 'feature', description: 'sndio output plugin')
 option('solaris_output', type: 'feature', description: 'Solaris /dev/audio support')
+option('visualization', type: 'boolean', value: true, description: 'Visualization output plugin')
+option('fftw3', type: 'feature', description: 'FFTW support')
 
 #
 # Misc libraries
diff --git a/src/output/Registry.cxx b/src/output/Registry.cxx
index 3dacd3e5ed..c17233daf9 100644
--- a/src/output/Registry.cxx
+++ b/src/output/Registry.cxx
@@ -23,6 +23,7 @@
 #include "plugins/ShoutOutputPlugin.hxx"
 #include "plugins/sles/SlesOutputPlugin.hxx"
 #include "plugins/SolarisOutputPlugin.hxx"
+#include "plugins/visualization/VisualizationOutputPlugin.hxx"
 #ifdef ENABLE_WINMM_OUTPUT
 #include "plugins/WinmmOutputPlugin.hxx"
 #endif
@@ -89,6 +90,9 @@ constinit const AudioOutputPlugin *const audio_output_plugins[] = {
 #endif
 #ifdef ENABLE_WASAPI_OUTPUT
 	&wasapi_output_plugin,
+#endif
+#ifdef ENABLE_VISUALIZATION_OUTPUT
+	&visualization_output_plugin,
 #endif
 	nullptr
 };
diff --git a/src/output/plugins/meson.build b/src/output/plugins/meson.build
index c790a52c09..c1f1e9c674 100644
--- a/src/output/plugins/meson.build
+++ b/src/output/plugins/meson.build
@@ -160,6 +160,24 @@ else
   wasapi_dep = dependency('', required: false)
 endif
 
+libfftw3_dep = dependency('fftw3f', version: '>= 3.3.10', required: get_option('fftw3'))
+output_features.set('ENABLE_FFTW3', libfftw3_dep.found())
+output_features.set('ENABLE_VISUALIZATION_OUTPUT', get_option('visualization'))
+if get_option('visualization')
+  if not libfftw3_dep.found()
+    error('libfftw3 not available, but is required for the visualization plugin')
+  endif
+  output_plugins_sources += [
+    'visualization/VisualizationOutputPlugin.cxx',
+    'visualization/SoundAnalysis.cxx',
+    'visualization/SoundInfoCache.cxx',
+    'visualization/VisualizationServer.cxx',
+    'visualization/VisualizationClient.cxx',
+    'visualization/Protocol.cxx',
+  ]
+  output_plugins_deps += [ event_dep, net_dep, libfftw3_dep ]
+endif
+
 output_plugins = static_library(
   'output_plugins',
   output_plugins_sources,
diff --git a/src/output/plugins/visualization/LowLevelProtocol.hxx b/src/output/plugins/visualization/LowLevelProtocol.hxx
new file mode 100644
index 0000000000..f48f62f5f5
--- /dev/null
+++ b/src/output/plugins/visualization/LowLevelProtocol.hxx
@@ -0,0 +1,57 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+// Copyright The Music Player Daemon Project
+
+#ifndef LOW_LEVEL_PROTOCOL_HXX_INCLUDED
+#define LOW_LEVEL_PROTOCOL_HXX_INCLUDED
+
+#include "util/PackedBigEndian.hxx"
+
+#include <fftw3.h>
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+
+namespace Visualization {
+
+/* Write a uint16_t to an output iterator over bytes in wire format; return
+ * the iterator in its new position
+ */
+template <typename OutIter>
+OutIter
+SerializeU16(uint16_t n, OutIter pout) {
+	auto m = PackedBE16(n);
+	auto p = (std::byte*)(&m);
+	return std::copy(p, p + 2, pout);
+}
+
+static_assert(std::numeric_limits<float>::is_iec559);
+
+/* Convert an IEEE 754 single-precision floating-point number to wire format;
+ * write it to an output iterator & return the iterator in its new position
+ */
+template <typename OutIter>
+OutIter
+SerializeFloat(float f, OutIter pout) {
+	auto m = PackedBE32(*(uint32_t*)&f);
+	auto p = (std::byte*)(&m);
+	return std::copy(p, p + 4, pout);
+}
+
+/* Convert an fftwf_complex to wire format; write it to an output iterator &
+ * return the iterator in its new position
+ */
+template <typename OutIter>
+OutIter
+SerializeComplex(const fftwf_complex c, OutIter pout) {
+	auto r = PackedBE32(*(const uint32_t*)&(c[0]));
+	auto i = PackedBE32(*(const uint32_t*)&(c[1]));
+	auto pr = (std::byte*)(&r);
+	auto pi = (std::byte*)(&i);
+	pout = std::copy(pr, pr + 4, pout);
+	return std::copy(pi, pi + 4, pout);
+}
+
+} // namespace Visualization
+
+#endif // LOW_LEVEL_PROTOCOL_HXX_INCLUDED
diff --git a/src/output/plugins/visualization/Protocol.cxx b/src/output/plugins/visualization/Protocol.cxx
new file mode 100644
index 0000000000..94a76219d3
--- /dev/null
+++ b/src/output/plugins/visualization/Protocol.cxx
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+// Copyright The Music Player Daemon Project
+
+#include "Protocol.hxx"
+
+#include "Log.hxx"
+#include "util/ByteOrder.hxx"
+#include "util/Domain.hxx"
+
+Visualization::ParseResult
+Visualization::ParseClihlo(void *data,
+			   size_t length,
+			   ClientHello &clihlo) noexcept {
+	// The CLIHLO payload is 6 bytes; the header & footer are five more.
+	if (length < sizeof(ClientHello) + 5) {
+		return ParseResult::NEED_MORE_DATA;
+	}
+
+	uint8_t *buf = (uint8_t *)data;
+
+	uint16_t msg_type = FromBE16(*(uint16_t *)buf);
+	if (msg_type != 0) {
+		return ParseResult::ERROR;
+	}
+
+	buf += 2;
+	uint16_t payload_len = FromBE16(*(uint16_t *)buf);
+	if (payload_len != 6) {
+		return ParseResult::ERROR;
+	}
+
+	buf += 2;
+	clihlo.major_version = *buf++;
+	clihlo.minor_version = *buf++;
+
+	clihlo.requested_fps = FromBE16(*(uint16_t *)(buf));
+	buf += 2;
+	clihlo.tau = FromBE16(*(int16_t *)(buf));
+	buf += 2;
+
+	if (*buf != 0) {
+		return ParseResult::ERROR;
+	}
+
+	return ParseResult::OK;
+}
diff --git a/src/output/plugins/visualization/Protocol.hxx b/src/output/plugins/visualization/Protocol.hxx
new file mode 100644
index 0000000000..a2cdc1c399
--- /dev/null
+++ b/src/output/plugins/visualization/Protocol.hxx
@@ -0,0 +1,138 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+// Copyright The Music Player Daemon Project
+
+#ifndef VISUALIZATION_PROTOCOL_HXX_INCLUDED
+#define VISUALIZATION_PROTOCOL_HXX_INCLUDED
+
+#include "LowLevelProtocol.hxx"
+#include "SoundAnalysis.hxx"
+
+#include <cstddef>
+#include <cstdint>
+
+namespace Visualization {
+
+/**
+ * \brief A parsed CLIHLO message
+ *
+ * \sa ParseClihlo
+ *
+ *
+ * The visualization \ref vis_out_protocol "protocol" begins with the client
+ * connecting to the server & providing certain parameters of the sound
+ * analysis it would like to receive. That is done through the CLIHLO message
+ * (see \ref vis_out_protocol_proto_clihlo "here").
+ *
+ * See \ref vis_out_protocol_timing "timing" for details on the parameter tau.
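+ *
+ * As a worked illustration (the particular values are only examples, not
+ * mandated by the protocol): a client requesting protocol version 0.1,
+ * 25 analyses per second, and tau = 100ms would send the following eleven
+ * octets (all multi-byte fields big-endian):
+ *
+ \code
+   00 00 | 00 06 | 00 | 01 | 00 19 | 00 64 | 00
+   type    length  maj  min  fps=25  tau=100 check
+ \endcode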
+ *
+ *
+ */
+
+struct ClientHello {
+	/// Major protocol version the client would like to speak
+	uint8_t major_version;
+	/// Minor protocol version the client would like to speak
+	uint8_t minor_version;
+	/// The number of sound analyses per second the client would like to
+	/// receive (presumably the rate at which it is rendering frames, hence
+	/// the name "fps")
+	uint16_t requested_fps;
+	/// The desired offset (named "tau" in the documentation) between song
+	/// time and analysis time at each analysis performed
+	int16_t tau;
+};
+
+enum class ParseResult {
+	OK,
+	NEED_MORE_DATA,
+	ERROR,
+};
+
+/**
+ * \brief Attempt to parse a \ref vis_out_protocol_proto_clihlo "CLIHLO"
+ * message from the given buffer
+ *
+ * \param buf [in] An array of octets potentially containing the message
+ *
+ * \param length [in] The length of \a buf, in octets
+ *
+ * \param clihlo [out] A reference to a `ClientHello` structure to be
+ * filled-in on successful execution
+ *
+ * \return ParseResult::OK if the message was successfully parsed,
+ * NEED_MORE_DATA if the message is incomplete, or ERROR if the message cannot
+ * be read from \a buf
+ *
+ *
+ * CLIHLO is the first message in the protocol, sent by the client. See
+ * \ref vis_out_protocol_proto_clihlo "the protocol specification" for details,
+ * and \ref vis_out_protocol "Visualization Network Protocol" for discussion
+ * of the protocol altogether.
+ *
+ *
+ */
+
+ParseResult
+ParseClihlo(void *buf, size_t length, ClientHello &clihlo) noexcept;
+
+/// Serialize an SRVHLO message to wire format
+template <typename OutIter>
+void
+SerializeSrvhlo(std::byte major_ver, std::byte minor_ver, OutIter pout) {
+	using std::byte;
+
+	*pout++ = byte{0}; //
+	*pout++ = byte{1}; // message type
+	*pout++ = byte{0}; //
+	*pout++ = byte{2}; // payload length
+	*pout++ = major_ver;
+	*pout++ = minor_ver;
+	*pout++ = byte{0}; // check byte
+}
+
+/// Serialize a FRAME message header to wire format
+template <typename OutIter>
+OutIter
+SerializeSoundInfoFrameHeader(uint8_t num_chan,
+			      size_t num_samp,
+			      size_t num_freq,
+			      OutIter pout) {
+
+	using std::byte;
+
+	// Start with the "magic number" allowing clients to "lock on" to the
+	// stream of sound info frames in the event of an error.
+	// See \ref vis_out_protocol_proto_msgs for details.
+	*pout++ = byte{0x63};
+	*pout++ = byte{0xac};
+	*pout++ = byte{0x84};
+	*pout++ = byte{0x03};
+
+	*pout++ = byte{16};
+	*pout++ = byte{0};
+
+	return SerializeU16(17 + 4 * num_chan * (num_samp + 3 * num_freq + 3),
+			    pout);
+}
+
+/// Serialize a FRAME message footer to wire format
+template <typename OutIter>
+void
+SerializeSoundInfoFrameFooter(OutIter pout) {
+	*pout = std::byte{0x00};
+}
+
+/// Serialize a FRAME message to wire format
+template <typename OutIter>
+void
+SerializeSoundInfoFrame(const Visualization::SoundAnalysis &a,
+			OutIter pout) {
+	pout = SerializeSoundInfoFrameHeader(a.NumChan(), a.NumSamp(),
+					     a.NumFreq(), pout);
+	pout = a.SerializeSoundInfoFramePayload(pout);
+	SerializeSoundInfoFrameFooter(pout);
+}
+
+} // namespace Visualization
+
+#endif // VISUALIZATION_PROTOCOL_HXX_INCLUDED
diff --git a/src/output/plugins/visualization/SoundAnalysis.cxx b/src/output/plugins/visualization/SoundAnalysis.cxx
new file mode 100644
index 0000000000..27c27acb57
--- /dev/null
+++ b/src/output/plugins/visualization/SoundAnalysis.cxx
@@ -0,0 +1,421 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+// Copyright The Music Player Daemon Project
+
+#include "SoundAnalysis.hxx"
+
+#include "Log.hxx"
+#include "config/Block.hxx"
+#include "lib/fmt/AudioFormatFormatter.hxx"
+#include "lib/fmt/RuntimeError.hxx"
+#include "pcm/FloatConvert.hxx"
+#include "util/Domain.hxx"
+
+#include <cassert>
+#include <climits>
+#include <cmath>
+#include <cstring>
+
+#include <unistd.h>
+
+const Domain d_sound_analysis("sound_analysis");
+
+Visualization::SoundAnalysisParameters::SoundAnalysisParameters() noexcept
+: SoundAnalysisParameters(DEFAULT_NUM_SAMPLES, DEFAULT_LO_CUTOFF, DEFAULT_HI_CUTOFF)
+{ }
+
+Visualization::SoundAnalysisParameters::SoundAnalysisParameters(
+	const ConfigBlock &config_block)
+: SoundAnalysisParameters(
+	config_block.GetPositiveValue("num_samples", DEFAULT_NUM_SAMPLES),
+	config_block.GetPositiveValue("lo_cutoff", DEFAULT_LO_CUTOFF),
+	config_block.GetPositiveValue("hi_cutoff", DEFAULT_HI_CUTOFF))
+{ }
+
+Visualization::SoundAnalysisParameters::SoundAnalysisParameters(
+	size_t num_samples_in,
+	float lo_cutoff_in,
+	float hi_cutoff_in):
+	num_samples(num_samples_in),
+	lo_cutoff(lo_cutoff_in), hi_cutoff(hi_cutoff_in)
+{
+	if (lo_cutoff >= hi_cutoff) {
+		throw FmtRuntimeError(
+			"lo_cutoff ({}) must be less than hi_cutoff ({})",
+			lo_cutoff, hi_cutoff);
+	}
+}
+
+/**
+ * \page vis_out_dft The Discrete Fourier Transform & Frequency Analysis
+ *
+ * \section vis_out_dft_intro Introduction
+ *
+ * This page contains some notes on the Discrete Fourier Transform as applied
+ * to music. They are a combination of dimly-remembered mathematics from
+ * university, source code comments from the milkdrop Winamp visualization
+ * plug-in, and the fftw documentation.
+ *
+ * \section vis_out_dft_basics The Basics
+ *
+ * The first thing to note is that the human ear can perceive sounds in the
+ * range 200 - 20,000Hz. For visualization purposes, implementations tend to
+ * throw away frequency data above 10,000Hz or so since there's not much
+ * activity there (something I've observed myself).
+ *
+ * Perceptually, frequency is not linear, it's logarithmic. A change of one
+ * octave corresponds to a doubling in frequency. Intuitively, this means that
+ * the difference between, say, 200 & 300Hz is \em much greater than the
+ * difference between 5000 & 5100Hz, for example.
+ *
+ * \subsection vis_out_dft_dft The Discrete Fourier Transform
+ *
+ * Given \c n audio samples, sampled at a frequency of \c F Hz, the DFT
+ * computes \c n complex numbers, each of which corresponds to the frequency:
+ *
+ \code
+            k * F
+   freq  =  -----,    k = 0,...,n-1
+       k      n
+ \endcode
+ *
+ * (see any standard reference on the DFT).
+ *
+ * The DFT library I'm using (fftw AKA "The
+ * Fastest Fourier Transform in the West") takes advantage of the Hermitian
+ * property of the Fourier Transform of real data, in which the k-th Fourier
+ * coefficient is the complex conjugate of the (n-k)-th coefficient, and only
+ * returns the first n/2+1 Fourier coefficients (i.e. indices 0 to n/2,
+ * inclusive) to save time & space. See the fftw documentation on real-data
+ * transforms.
+ *
+ * Therefore, the first Fourier coefficient returned corresponds to 0Hz, and
+ * the last to:
+ *
+ \code
+                n
+                - * F
+                2         F
+   freq     =   -----  =  -
+       n/2        n       2
+ \endcode
+ *
+ * or half the sampling frequency.
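+ *
+ * Expressed in code (a small sketch; these helper names are made up for this
+ * comment and are not part of the plugin), the index-to-frequency mapping and
+ * its inverse are just the formula above:
+ *
+ \code
+   // Frequency, in Hz, of the k-th Fourier coefficient, given n samples
+   // taken at F Hz; meaningful for k = 0..n/2.
+   float BinToFreq(size_t k, size_t n, float F) {
+       return float(k) * F / float(n);
+   }
+
+   // The inverse mapping: the coefficient index nearest to frequency f.
+   size_t FreqToBin(float f, size_t n, float F) {
+       return size_t(roundf(f * float(n) / F));
+   }
+ \endcode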
+ *
+ * \subsection vis_out_dft_buckets How To Bucket Frequencies
+ *
+ * To divide frequency data into \c N bands (whether for a scheme like bass/
+ * mids/trebs, or into a number of bars for visualization purposes), consider
+ * your frequency range & compute the number of octaves therein. If we let \c n
+ * be the number of octaves, then we know:
+ *
+ \code
+    n    freq_hi            log(freq_hi/freq_lo)
+   2  =  -------   =>  n =  --------------------
+         freq_lo                  log(2)
+ \endcode
+ *
+ * The \c N bands will then be:
+ *
+ \code
+                         n/N
+   freq_lo ... freq_lo * 2
+
+              n/N                2*n/N
+   freq_lo * 2    ... freq_lo * 2
+
+   ...
+
+              (N-1)*n/N              n
+   freq_lo * 2          ... freq_lo * 2
+ \endcode
+ *
+ * \subsection vis_out_dft_eg Worked Example
+ *
+ * Let the number of samples n be 576. This means our DFT will return n/2 + 1 =
+ * 289 complex numbers. Let our sampling frequency F be 44,100Hz. For each k,
+ * k=0...288, the corresponding frequency will be k * 44100/576, giving us a
+ * frequency range of 0Hz to 22,050Hz. Let's clamp that to 200-11,000Hz,
+ * compute the power spectrum, and divide that power up into three bands:
+ * bass, mids & trebs.
+ *
+ * First, we need to find the indices into the DFT corresponding to our
+ * desired frequency range.
+ *
+ \code
+        k * F               f * n
+   f  = -----  ==>  k  =   ------,  where f  := the frequency of the k-th
+    k     n                   F            k    Fourier coefficient
+
+        -           -
+        | 200 * 576 |
+   k0 = | --------- | = floor(2.61...) = 2
+        |   44100   |
+        -           -
+
+        -             -
+        | 11000 * 576 |
+   k1 = | ----------- | = ceil(143.67...) = 144
+        |    44100    |
+ \endcode
+ *
+ * So the power spectrum will have 144 - 2 + 1 = 143 elements in it. Nb. we're
+ * throwing away roughly the upper half of our frequency spectrum.
+ *
+ * To divide these frequencies into three bands such that each band contains
+ * the same number of octaves, we compute how many octaves there are in our
+ * frequency range (call this \c n):
+ *
+ \code
+    n   11000          log(11000/200)
+   2  = -----  =>  n = -------------- = 5.7814
+         200               log(2)
+ \endcode
+ *
+ * In other words, there are 5.7814 octaves in our chosen frequency range. We
+ * want to give each band 5.7814/3 = 1.9271 octaves. That means the three
+ * "buckets" will be:
+ *
+ \code
+                   1.9271
+   200 ..... 200 * 2        or   200 - 761Hz
+
+          1.9271          2*1.9271
+   200 * 2       .. 200 * 2         or   761 - 2,892Hz
+
+          2*1.9271        5.7814
+   200 * 2         . 200 * 2        or   2,892 - 11,000Hz
+ \endcode
+ *
+ *
+ */
+
+Visualization::SoundAnalysis::SoundAnalysis(
+	const SoundAnalysisParameters &params,
+	std::shared_ptr<SoundInfoCache> pc)
+: num_samples(params.GetNumSamples()),
+  out_samples((num_samples / 2) + 1),
+  pcache(pc),
+  audio_format(pc->GetFormat()),
+  num_channels(audio_format.channels),
+  cbbuf(params.GetNumSamples() * audio_format.GetFrameSize()),
+  buf(new std::byte[cbbuf]),
+  in(fftwf_alloc_real(num_samples * num_channels), fftwf_free),
+  out(fftwf_alloc_complex(out_samples * num_channels), fftwf_free),
+  bass_mids_trebs(new float[3 * num_channels])
+{
+	if (num_samples > INT_MAX) {
+		throw FmtInvalidArgument(
+			"num_samples({}) may not be larger than {}",
+			num_samples, INT_MAX);
+	}
+
+	int n[] = { (int)num_samples };
+
+	/* The input is assumed to be interleaved; this seems convenient from
+	 * the perspective of how it's stored from the AudioOutput... though if
+	 * we need an additional copy to convert it to `float`, we'd have the
+	 * opportunity to re-arrange it.
+	 */
+
+	int dist = num_samples;
+
+	/* Per the FFTW docs:
+	 *
+	 * "`rank` is the rank of the transform (it should be the size of the
+	 * array `*n`); we use the term rank to denote the number of
+	 * independent indices in an array. For example, we say that a 2d
+	 * transform has rank 2, a 3d transform has rank 3, and so on."
+	 *
+	 * This is always 1, for us.
+	 *
+	 * Layout of `in`:
+	 *
+	 * | 0 ... num_samples-1 | num_samples ... 2*num_samples-1 | 2*num_samples ...
+	 * |  data for chan 0    |        data for chan 1          | data for chan 2 */
+
+	/* `howmany` is the number of transforms to compute. The resulting plan
+	 * computes `howmany` transforms, where the input of the k-th transform
+	 * is at location in+k*idist (in C pointer arithmetic), and its output
+	 * is at location out+k*odist. */
+
+	int odist = (num_samples / 2) + 1;
+
+	plan = fftwf_plan_many_dft_r2c(
+		1,	      // rank of the input array-- we have one-dimensional arrays
+		n,	      // the number of elements in each array
+		num_channels, // one array for each channel
+		in.get(),     // input buffer-- need to copy samples here before executing
+		NULL,
+		1,	      // input stride
+		dist,	      // distance between successive arrays (indexes, not bytes)
+		out.get(),    // output buffer-- overwritten on each execution
+		NULL,
+		1,	      // output stride
+		odist,	      // distance between successive arrays (indexes, not bytes)
+		FFTW_ESTIMATE); // should probably be zero (to select a more exhaustive
+				// search), but out of an abundance of caution, tell
+				// FFTW to pick a plan quickly
+	if (NULL == plan) {
+		throw FmtRuntimeError("Failed to generate an FFTW plan: "
+				      "num_samp={},num_chan={}",
+				      num_samples, num_channels);
+	}
+
+	freq_lo = params.GetLoCutoff();
+
+	float samples_per_sec = (float) audio_format.GetSampleRate();
+	float ns = (float) num_samples;
+	// The highest frequency we can represent will be:
+	float max_freq = (ns - 1.0f) * samples_per_sec / ns;
+	if (max_freq < params.GetHiCutoff()) {
+		FmtWarning(d_sound_analysis,
+			   "Clamping max frequency from {} to {}",
+			   params.GetHiCutoff(), max_freq);
+		freq_hi = max_freq;
+	} else {
+		freq_hi = params.GetHiCutoff();
+	}
+
+	idx_lo = (size_t)floorf(freq_lo *
+				(float) num_samples / samples_per_sec);
+	idx_hi = (size_t) ceilf(freq_hi * (float)num_samples / samples_per_sec);
+
+	float num_octaves = logf(freq_hi/freq_lo) / 0.69314718f;
+
+	float freq_mids = freq_lo * powf(2.0f, num_octaves / 3.0f);
+	float freq_trebs = freq_lo * powf(2.0f, 2.0f * num_octaves / 3.0f);
+
+	idx_mids = ns * freq_mids / samples_per_sec;
+	idx_trebs = ns * freq_trebs / samples_per_sec;
+}
+
+bool
+Visualization::SoundAnalysis::Update(SoundInfoCache::Time t) noexcept
+{
+	FmtDebug(d_sound_analysis, "SoundAnalysis::Update(tid: {}), time {}us, "
+		 "# samp: {}, buffer size: {}", gettid(),
+		 duration_cast<std::chrono::microseconds>(t.time_since_epoch()).count(),
+		 num_samples, pcache->Size());
+
+	if (!pcache->GetByTime(num_samples, t, buf.get(), cbbuf)) {
+		FmtWarning(d_sound_analysis, "Failed to get samples by time "
+			   "for sound analysis ({} samples requested, at "
+			   "time {}us for buf size {}).", num_samples,
+			   duration_cast<std::chrono::microseconds>(t.time_since_epoch()).count(),
+			   cbbuf);
+		return false;
+	}
+
+	/* Copy the raw PCM data from `buf` into `in`. I hate this, but we need
+	 * to convert the input data from `uint16_t` (or whatever) to `float`
+	 * regardless. We could, of course, do the conversion when the PCM data
+	 * is added to the cache, but since I anticipate processing far fewer
+	 * samples than I store, I expect this to be more efficient (both in
+	 * terms of time & space).
+	 *
+	 * Since we have to do the copy anyway, let's convert from interleaved
+	 * to sequential (i.e. all samples for the first channel laid-out
+	 * contiguously, followed by all from the second, and so forth). */
+	typedef IntegerToFloatSampleConvert<SampleFormat::S8> S8Cvt;
+	typedef IntegerToFloatSampleConvert<SampleFormat::S16> S16Cvt;
+	typedef IntegerToFloatSampleConvert<SampleFormat::S32> S32Cvt;
+	typedef IntegerToFloatSampleConvert<SampleFormat::S24_P32> S24P32;
+
+	for (size_t i = 0; i < num_samples; ++i) {
+		for (size_t j = 0; j < num_channels; ++j) {
+			/* `buf` index: i * num_channels + j
+			 * `in` index: j * num_samples + i */
+			float x;
+			switch (audio_format.format) {
+			case SampleFormat::S8:
+				x = S8Cvt::Convert(
+					*(const int8_t*)(buf.get() +
+							 i * num_channels + j));
+				break;
+			case SampleFormat::S16:
+				x = S16Cvt::Convert(
+					*(const int16_t*)(buf.get() +
+							  2 * (i*num_channels + j)));
+				break;
+			case SampleFormat::S32:
+				x = S32Cvt::Convert(
+					*(const int32_t*)(buf.get() +
+							  4 * (i*num_channels + j)));
+				break;
+			case SampleFormat::FLOAT:
+				x = *(const float*)(buf.get() +
+						    4 * (i * num_channels + j));
+				break;
+			case SampleFormat::S24_P32:
+				/* signed 24 bit integer samples, packed in 32
+				 * bit integers (the most significant byte is
+				 * filled with the sign bit) */
+				x = S24P32::Convert(
+					*(const int32_t *)(buf.get() +
+							   4 * (i*num_channels + j)));
+				break;
+			default:
+				assert(false);
+				x = 0.f;
+			}
+			in.get()[j * num_samples + i] = x;
+		}
+	}
+
+	fftwf_execute(plan);
+
+	size_t max_coeffs_idx = num_samples/2;
+
+	for (unsigned c = 0; c < num_channels; ++c) {
+
+		bass_mids_trebs[3 * c] = bass_mids_trebs[3 * c + 1] =
+			bass_mids_trebs[3*c+2] = 0.0f;
+
+		// walk [idx_lo, idx_hi)
+		for (size_t i = idx_lo; i < idx_hi; ++i) {
+			size_t j = i;
+			if (j > max_coeffs_idx) {
+				j = num_samples - j;
+			}
+			fftwf_complex *pout =
+				out.get() + c * (max_coeffs_idx + 1);
+			float contrib = sqrt(
+				pout[j][0]*pout[j][0] + pout[j][1]*pout[j][1]);
+			if (i < idx_mids) {
+				bass_mids_trebs[3*c] += contrib;
+			} else if (i < idx_trebs) {
+				bass_mids_trebs[3*c + 1] += contrib;
+			} else {
+				bass_mids_trebs[3*c + 2] += contrib;
+			}
+		}
+	}
+
+	return true;
+}
+
+bool
+Visualization::SoundAnalysis::GetCoeffs(fftwf_complex *coeffs,
+					size_t num_complex) const noexcept {
+	if (num_complex < out_samples * num_channels) {
+		return false;
+	}
+
+	/* Would prefer to use `std::copy`, but fftw regrettably defines
+	 * `fftwf_complex` as `float[2]`, which confuses it. */
+	memcpy(coeffs, out.get(),
+	       out_samples * num_channels * sizeof(fftwf_complex));
+
+	return true;
+}
+
+bool
+Visualization::SoundAnalysis::GetBassMidsTrebs(float *buf_out,
+					       size_t num_buf) const {
+
+	if (num_buf < 3 * num_channels) {
+		return false;
+	}
+
+	std::copy(bass_mids_trebs.get(),
+		  bass_mids_trebs.get() + 3 * num_channels,
+		  buf_out);
+	return true;
+}
diff --git a/src/output/plugins/visualization/SoundAnalysis.hxx b/src/output/plugins/visualization/SoundAnalysis.hxx
new file mode 100644
index 0000000000..92db0d600c
--- /dev/null
+++ b/src/output/plugins/visualization/SoundAnalysis.hxx
@@ -0,0 +1,307 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+// Copyright The Music Player Daemon Project
+
+#ifndef SOUND_ANALYSIS_HXX_INCLUDED
+#define SOUND_ANALYSIS_HXX_INCLUDED 1
+
+#include "SoundInfoCache.hxx"
+#include "LowLevelProtocol.hxx"
+
+#include <fftw3.h>
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+
+#include <cmath>
+
+struct ConfigBlock;
+
+namespace Visualization {
+
+/**
+ * \brief Convenience class for expressing sound analysis parameters exclusive
+ * of the audio format
+ *
+ *
+ * There are any number of parameters governing our analysis of PCM data. Other
+ * than the `AudioFormat`, they are read from configuration at startup and
+ * constant. Rather than force callers to write methods taking many parameters,
+ * this class collects them all in one place, and enforces some constraints on
+ * their values.
+ *
+ *
+ */
+
+class SoundAnalysisParameters
+{
+	/* The number of samples used for each analysis; this must be greater
+	 * than zero and needn't be large (say, less than 1024). Configuration
+	 * value "num_samples" */
+	size_t num_samples;
+	/* Data lower than this frequency (in the frequency domain) shall be
+	 * discarded; must be greater than or equal to zero, and less than
+	 * hi_cutoff. A typical value would be 200 (the lower range of human
+	 * perception). Units: Hz. Configuration value "lo_cutoff" */
+	float lo_cutoff;
+	/* Data greater than this frequency (in the frequency domain) shall be
+	 * discarded; must be greater than or equal to zero, and greater than
+	 * lo_cutoff. A typical value would be 10000-12000 (empirically,
+	 * there's not a lot of activity above 10000 in song data). Units:
+	 * Hz. Configuration value "hi_cutoff" */
+	float hi_cutoff;
+
+	static constexpr size_t DEFAULT_NUM_SAMPLES = 513;
+	static constexpr size_t DEFAULT_LO_CUTOFF = 200;
+	static constexpr size_t DEFAULT_HI_CUTOFF = 10000;
+
+public:
+	SoundAnalysisParameters() noexcept;
+	explicit SoundAnalysisParameters(const ConfigBlock &config_block);
+	SoundAnalysisParameters(size_t num_samples, float lo_cutoff,
+				float hi_cutoff);
+
+	size_t
+	GetNumSamples() const noexcept {
+		return num_samples;
+	}
+	float
+	GetLoCutoff() const noexcept {
+		return lo_cutoff;
+	}
+	float
+	GetHiCutoff() const noexcept {
+		return hi_cutoff;
+	}
+};
+
+/**
+ * \class SoundAnalysis
+ *
+ * \brief Analyze PCM data in a manner convenient for visualization authors
+ *
+ *
+ * This class houses our logic for going from raw PCM data to the power
+ * spectrum, bass/mids/trebs &c. Instances are constructed with configuration
+ * information on the analysis details, and repeated analysis for different
+ * datasets is performed via Update(). Since instances allocate input & output
+ * buffers for the discrete Fourier transform, they are not meant to be copied
+ * around.
+ *
+ *
+ */
+
+class SoundAnalysis {
+
+	/// # of samples to be used in each analysis
+	size_t num_samples;
+	/// # of Fourier coefficients computed by FFTW (should be
+	/// (num_samples / 2) + 1)
+	size_t out_samples;
+	std::shared_ptr<SoundInfoCache> pcache;
+	AudioFormat audio_format;
+	/* # of audio channels (e.g. 1 is mono, 2 is stereo--
+	 * cf. SampleFormat.hxx); should be audio_format.channels */
+	uint8_t num_channels;
+	/// Size of `buf`, in bytes
+	size_t cbbuf;
+	/// Pre-allocated buffer for raw PCM data
+	std::unique_ptr<std::byte[]> buf;
+	/// Input array for all FFTs performed by this `SoundAnalysis` instance
+	std::unique_ptr<float[], decltype(&fftwf_free)> in;
+	/// Output array for all FFTs performed by this `SoundAnalysis` instance
+	std::unique_ptr<fftwf_complex[], decltype(&fftwf_free)> out;
+	/* Pre-computed (by fftw) information on the fastest way to compute the
+	 * Discrete Fourier Transform on the underlying hardware */
+	fftwf_plan plan;
+	/* Frequency cutoffs, in Hz; we'll return frequencies in the range
+	   [freq_lo, freq_hi] */
+	float freq_lo, freq_hi;
+	/* Indices into `out` corresponding to the desired frequency range;
+	 * that range is indexed by [idx_lo, idx_hi) */
+	size_t idx_lo, idx_hi;
+	/// Indices into `out` corresponding to "mids" & "trebs"
+	size_t idx_mids, idx_trebs;
+	/* Bass/mids/trebs, laid-out as [bass, mids, trebs, bass, mids, trebs]
+	 * (i.e. 3 * num_channels floats) */
+	std::unique_ptr<float[]> bass_mids_trebs;
+
+public:
+	SoundAnalysis(const SoundAnalysisParameters &params,
+		      std::shared_ptr<SoundInfoCache> pc);
+
+	uint8_t
+	NumChan() const noexcept {
+		return num_channels;
+	}
+	/// Return the number of audio samples, per channel, used in each analysis
+	size_t
+	NumSamp() const noexcept {
+		return num_samples;
+	}
+	/* Return the number of Fourier coefficients & power spectrum values
+	 * returned, per channel; this is determined by the number of samples
+	 * and the frequency cutoffs */
+	size_t
+	NumFreq() const noexcept {
+		return idx_hi - idx_lo;
+	}
+
+	/// Update the current analysis to be current as of time \a t
+	bool Update(SoundInfoCache::Time t) noexcept;
+
+	/* Return the first half of the Fourier coefficients (bearing in mind
+	 * that the Hermitian property means we only need to deal with the
+	 * first nsamp/2 + 1) with no frequency cutoffs. Mostly used for
+	 * testing */
+	bool GetCoeffs(fftwf_complex *coeffs,
+		       size_t num_complex) const noexcept;
+	bool GetBassMidsTrebs(float *buf, size_t num_buf) const;
+
+	/////////////////////////////////////////////////////////////////////////
+	//                         Serialization Support                       //
+	/////////////////////////////////////////////////////////////////////////
+
+	/* Write the waveforms used in the current analysis to \a pout; return
+	 * the updated iterator. The waveforms will be written as per the
+	 * \ref vis_out_protocol_proto_frame "protocol spec".
+	 */
+	template <typename OutIter>
+	OutIter
+	SerializeWaveforms(OutIter pout) const {
+		const float *pin = in.get();
+		for (size_t j = 0; j < num_channels; ++j) {
+			for (size_t i = 0; i < num_samples; ++i) {
+				pout = SerializeFloat(pin[j * num_samples + i],
+						      pout);
+			}
+		}
+		return pout;
+	}
+
+	/* Write the frequency coefficients that resulted from the current
+	 * analysis, subject to frequency cutoffs, to \a pout; return the
+	 * updated iterator. The coefficients will be written as per the
+	 * \ref vis_out_protocol_proto_frame "protocol spec". */
+	template <typename OutIter>
+	OutIter
+	SerializeCoefficients(OutIter pout) const {
+		return TransformCoeffs(pout, SerializeComplex<OutIter>);
+	}
+
+	/* Write the magnitude of a complex number (presumably a Fourier
+	 * coefficient) to \a pout; return the updated iterator. The magnitude
+	 * will be written as per the \ref vis_out_protocol_proto_frame
+	 * "protocol spec". */
+	template <typename OutIter>
+	static
+	OutIter
+	SerializeSpectrum(const fftwf_complex c, OutIter pout) {
+		return SerializeFloat(sqrt(c[0] * c[0] + c[1] * c[1]), pout);
+	}
+
+	/* Write the power spectrum that resulted from the current analysis to
+	 * \a pout; return the updated iterator. The power spectrum will be
+	 * written as per the \ref vis_out_protocol_proto_frame "protocol
+	 * spec". */
+	template <typename OutIter>
+	OutIter
+	SerializePowerSpectra(OutIter pout) const {
+		return TransformCoeffs(pout, SerializeSpectrum<OutIter>);
+	}
+
+	/* Write the bass/mids/trebs values that resulted from the current
+	 * analysis to \a pout; return the updated iterator. The values will be
+	 * written as per the \ref vis_out_protocol_proto_frame "protocol
+	 * spec". */
+	template <typename OutIter>
+	OutIter
+	SerializeBassMidsTrebs(OutIter pout) const {
+		float *bmt = bass_mids_trebs.get();
+		for (size_t i = 0; i < num_channels; ++i) {
+			pout = SerializeFloat(bmt[3 * i], pout);
+			pout = SerializeFloat(bmt[3 * i + 1], pout);
+			pout = SerializeFloat(bmt[3 * i + 2], pout);
+		}
+		return pout;
+	}
+
+	/* Write the payload of a \c FRAME message to \a pout; return the
+	 * updated iterator. The payload will be written as per the
+	 * \ref vis_out_protocol_proto_frame "protocol spec".
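	 *
	 * For orientation, the wire layout this method produces (all
	 * multi-byte values big-endian, floats IEEE 754 single-precision;
	 * this summary is derived from the serialization calls below) is:
	 *
	 \code
	   u16 num_samp | u8 num_chan | u16 sample_rate
	   f32 waveforms[num_chan][num_samp]
	   u16 num_freq | f32 freq_lo | f32 freq_hi | u16 idx_lo
	   (f32, f32) coefficients[num_chan][num_freq]
	   f32 power_spectra[num_chan][num_freq]
	   f32 bass_mids_trebs[num_chan][3]
	 \endcode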
	 */
	template <typename OutIter>
	OutIter
	SerializeSoundInfoFramePayload(OutIter pout) const {
		pout = SerializeU16(num_samples, pout);
		*pout++ = (std::byte) num_channels;
		pout = SerializeU16(audio_format.GetSampleRate(), pout);
		pout = SerializeWaveforms(pout);
		pout = SerializeU16(NumFreq(), pout);
		pout = SerializeFloat(freq_lo, pout);
		pout = SerializeFloat(freq_hi, pout);
		pout = SerializeU16(idx_lo, pout);
		pout = SerializeCoefficients(pout);
		pout = SerializePowerSpectra(pout);
		pout = SerializeBassMidsTrebs(pout);
		return pout;
	}

	/* Write the Fourier coefficients in the range `[idx_lo, idx_hi)` to
	 * \a pout, first transforming them by \a op. */
	template <typename OutIter>
	OutIter
	TransformCoeffs(
		OutIter pout,
		OutIter (*op)(const fftwf_complex, OutIter pout)) const {

		/* We wish to serialize the Fourier coefficients [idx_lo,
		 * idx_hi), transformed by `op`. The issue is that `out` stores
		 * the coefficients [0, num_samples/2 + 1), so we need to
		 * transform the indexing operation. */
		const fftwf_complex *po = out.get();

		// The # of frequencies stored in `out` per channel
		size_t total_freq_per_chan = num_samples / 2 + 1;

		// The maximum indexable frequency per channel
		size_t upper_freq_per_chan =
			std::min(idx_hi, total_freq_per_chan);

		/* In both `out` & `pout`, the coefficients are laid out as:
		 * | coeffs for chan #0... | coeffs for chan #1... | ... |
		 * so the outer loop will be on channel. */
		for (unsigned chan = 0; chan < num_channels; ++chan) {

			/* This is the index into `out` of the very first
			 * Fourier coefficient for this channel. */
			size_t first_freq_this_chan =
				chan * total_freq_per_chan;
			/* Beginning from here, we want to walk the indices:
			 * [idx_lo, upper_freq_per_chan)
			 * This will take us from the "low" frequency index up
			 * to num_samp/2 + 1 or idx_hi, whichever is least. */
			for (size_t i = first_freq_this_chan + idx_lo;
			     i < first_freq_this_chan + upper_freq_per_chan;
			     ++i) {
				pout = op(po[i], pout);
			}
			/* *If* idx_hi is greater than num_samp/2+1, walk back
			 * *down* the Fourier coefficients (taking advantage of
			 * the Hermitian property). */
			if (idx_hi > total_freq_per_chan) {
				for (size_t i =
					     first_freq_this_chan + idx_hi - 1;
				     i >= first_freq_this_chan +
					     total_freq_per_chan;
				     --i) {
					/* Mirror the index *within* this
					 * channel's block of coefficients. */
					size_t k = i - first_freq_this_chan;
					fftwf_complex c = {
						po[first_freq_this_chan +
						   num_samples - k][0],
						-po[first_freq_this_chan +
						    num_samples - k][1] };
					pout = op(c, pout);
				}
			}
		}
		return pout;
	}

};

} // namespace Visualization

#endif // SOUND_ANALYSIS_HXX_INCLUDED
diff --git a/src/output/plugins/visualization/SoundInfoCache.cxx b/src/output/plugins/visualization/SoundInfoCache.cxx
new file mode 100644
index 0000000000..8c8cedd720
--- /dev/null
+++ b/src/output/plugins/visualization/SoundInfoCache.cxx
@@ -0,0 +1,243 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+// Copyright The Music Player Daemon Project
+
+#include "SoundInfoCache.hxx"
+
+#include "Log.hxx"
+#include "util/Domain.hxx"
+
+#include <algorithm>
+#include <cmath>
+#include <cstring>
+
+#include <unistd.h>
+
+using namespace Visualization;
+using namespace std::chrono;
+
+const Domain d_sound_info_cache("vis_sound_info_cache");
+
+inline
+typename std::chrono::microseconds::rep
+NowTicks() {
+	return duration_cast<microseconds>(
+		std::chrono::system_clock::now().time_since_epoch()).count();
+}
+
+Visualization::SoundInfoCache::SoundInfoCache(const AudioFormat &audio_format,
+					      const Duration &buf_span):
+	fmt(audio_format),
+	secs_per_frame(1. / double(fmt.GetSampleRate())),
+	frame_size(audio_format.GetFrameSize()),
+	ring(fmt.TimeToSize(buf_span)),
+	cb(0),
+	p0(0),
+	p1(0)
+{ }
+
+/**
+ * \brief Add \a size bytes of PCM data to the cache; \a data is assumed to be
+ * PCM data in our audio format
+ *
+ *
+ * \param data [in] Address of a buffer containing PCM samples to be added to
+ * the cache
+ *
+ * \param size [in] Size of \a data, in bytes
+ *
+ *
+ * This method will add \a data to the end of the cache, overwriting earlier
+ * data if necessary.
+ *
+ * Nb. regarding the corner case where \a size is larger than the cache itself:
+ * in this event, the implementation will simply write as much of \a data into
+ * the cache as possible, discarding both the first portion of \a data as well
+ * as the previous contents of the cache.
+ *
+ *
+ */
+
+void
+Visualization::SoundInfoCache::Add(const void *data, size_t size)
+{
+	FmtDebug(d_sound_info_cache, "[{}] SoundInfoCache::Add(tid:{},"
+		 "bytes:{})", NowTicks(), gettid(), size);
+
+	std::lock_guard guard(mutex);
+
+	if (t0.time_since_epoch().count() == 0) {
+		t0 = system_clock::now();
+		t1 = t0;
+	}
+
+	size_t cb_ring = ring.size();
+	if (size > cb_ring) {
+		/* Special case: we can't fit this chunk into the ring buffer;
+		   just take the last `cb_ring` bytes & discard everything
+		   earlier. */
+		size_t lost = size - cb_ring;
+		memcpy(ring.data(), (const uint8_t*)data + lost, cb_ring);
+		cb = cb_ring;
+		p0 = p1 = 0;
+		t1 += fmt.SizeToTime(size);
+		t0 = t1 - fmt.SizeToTime(cb_ring);
+	} else {
+		/* Happy path: `size` is <= `cb_ring`. We can fit it all, but
+		   may overwrite old data. */
+		size_t part1 =
+			std::min(size, cb_ring - p1); // #bytes written at p1
+		size_t part2 = size - part1;          // #bytes "wrapped around"
+
+		memcpy(ring.data() + p1, data, part1);
+		memcpy(ring.data(), (const uint8_t*)data + part1, part2);
+
+		p1 = (p1 + size) % cb_ring;
+
+		// # bytes overwritten at start/p0
+		size_t part3;
+		if (cb == cb_ring) {
+			part3 = size;
+		} else {
+			part3 = part2 > (size_t) p0 ? part2 - p0 : 0;
+		}
+
+		p0 = (p0 + part3) % cb_ring;
+		cb = cb + size - part3;
+
+		t0 += fmt.SizeToTime(part3);
+		t1 += fmt.SizeToTime(size);
+	}
+}
+
+// This is primarily used for testing purposes.
+bool
+Visualization::SoundInfoCache::GetFromBeginning(size_t nsamp,
+						void *buf,
+						size_t cbbuf) const
+{
+	std::lock_guard guard(mutex);
+
+	size_t cbsamp = nsamp * frame_size;
+	if (cbsamp > cbbuf) {
+		return false;
+	}
+
+	size_t part1 = std::min(cbsamp, ring.size() - p0);
+	size_t part2 = cbsamp - part1;
+	memcpy(buf, ring.data() + p0, part1);
+	memcpy((uint8_t*)buf + part1, ring.data(), part2);
+
+	return true;
+}
+
+/**
+ * \brief Retrieve \a nsamp PCM samples ending at time \a t; copy them into
+ * \a buf; will return false if this cannot be done for any reason
+ *
+ *
+ * \param nsamp [in] the number of PCM samples desired by the caller; this
+ * corresponds to an AudioFormat "frame": IOW each sample is made up of
+ * multiple channels of PCM data
+ *
+ * \param t [in] the time at which the sampling shall \e end
+ *
+ * \param buf [in] a caller-supplied buffer to which, on success, \a nsamp
+ * audio frames will be copied
+ *
+ * \param cbbuf [in] the size, in bytes, of the buffer at \a buf
+ *
+ * \return true on success, false on failure
+ *
+ *
+ * This method will copy \a nsamp audio samples ending at time \a t into
+ * \a buf. If \a t does not exactly correspond to an audio sample, it will be
+ * adjusted by the implementation to correspond to the next whole sample.
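 *
 * A worked example (the numbers are only illustrative): at 44,100Hz with
 * four-byte frames (16-bit stereo), a request ending at t = t0 + 0.5s ends
 * at frame
 *
 \code
   ceil(0.5 / (1/44100)) = 22050
 \endcode
 *
 * i.e. at byte offset p0 + 22050 * 4 into the ring (modulo its size), and
 * the \a nsamp requested frames are copied from the \a nsamp * 4 bytes
 * ending there, wrapping around the ring as needed.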
+ *
+ *
+ */
+
+bool
+Visualization::SoundInfoCache::GetByTime(size_t nsamp, Time t,
+					 void *buf, size_t cbbuf) const
+{
+	using std::min;
+
+	FmtDebug(d_sound_info_cache, "[{}] SoundInfoCache::GetByTime"
+		 "(tid:{},t:{}us, delta:{}us)", NowTicks(), gettid(),
+		 duration_cast<microseconds>(t.time_since_epoch()).count(),
+		 duration_cast<microseconds>(t1 - t).count());
+
+	std::lock_guard guard(mutex);
+
+	size_t cbsamp = nsamp * frame_size;
+	if (cbsamp > cbbuf) {
+		/* Can't fit the requested number of frames/samples into
+		   `buf`-- fail. */
+		FmtWarning(d_sound_info_cache,
+			   "[{}] SoundInfoCache::GetByTime: can't fit {} "
+			   "samples into {} bytes", NowTicks(), nsamp, cbbuf);
+		return false;
+	}
+
+	if (t > t1) {
+		FmtWarning(d_sound_info_cache,
+			   "[{}] SoundInfoCache::GetByTime: time t {}us is "
+			   "greater than time t1 {}us-- failing.",
+			   NowTicks(),
+			   duration_cast<microseconds>(t.time_since_epoch()).count(),
+			   duration_cast<microseconds>(t1.time_since_epoch()).count());
+		return false;
+	}
+
+	/* Determine which frame `t` falls into. If `t - t0` is a perfect
+	   multiple of the time-per-frame, use the last frame.
+
+	   I need the duration `t - t0` in seconds, but with the fractional
+	   part preserved. */
+	double delta_t = double(duration_cast<microseconds>(t - t0).count()) / 1000000.;
+	ptrdiff_t pb =
+		p0 + ptrdiff_t(ceil(delta_t / secs_per_frame)) * frame_size;
+
+	// Make sure we have enough samples in [t0, t) to satisfy this request.
+	size_t cb_in_buf = size_t(ceil(delta_t / secs_per_frame)) * frame_size;
+	if (cbsamp > cb_in_buf) {
+		FmtWarning(d_sound_info_cache,
+			   "[{}] SoundInfoCache::GetByTime: the requested "
+			   "number of samples take up {} bytes, but we only "
+			   "have {} bytes in the buffer.",
+			   NowTicks(), cbsamp, cb_in_buf);
+		return false;
+	}
+
+	size_t cb_ring = ring.size();
+	ptrdiff_t pa = pb - nsamp * frame_size;
+	pb = pb % cb_ring;
+	pa = pa % cb_ring;
+
+	/* So we want to copy offsets [pa, pb) % cb_ring :=> buf. "part1"
+	   denotes the range from `pa` to the end of the buffer, and "part2"
+	   that from the start of the buffer to `pb`. */
+	size_t part1 = min(cbsamp, cb_ring - (size_t)pa);
+	size_t part2 = cbsamp - part1;
+	memcpy(buf, ring.data() + pa, part1);
+	memcpy((uint8_t*)buf + part1, ring.data(), part2);
+
+	return true;
+}
+
+/// Return true IFF the ring buffer is empty
+bool
+Visualization::SoundInfoCache::Empty() const {
+	std::lock_guard guard(mutex);
+	/* Nb. read `cb` directly rather than calling Size(), which would
+	   attempt to re-acquire the (non-recursive) mutex. */
+	return 0 == cb;
+}
+
+std::pair<SoundInfoCache::Time, SoundInfoCache::Time>
+Visualization::SoundInfoCache::Range() const
+{
+	std::lock_guard guard(mutex);
+	return std::make_pair(t0, t1);
+}
+
+std::size_t
+Visualization::SoundInfoCache::Size() const
+{
+	std::lock_guard guard(mutex);
+	return cb;
+}
diff --git a/src/output/plugins/visualization/SoundInfoCache.hxx b/src/output/plugins/visualization/SoundInfoCache.hxx
new file mode 100644
index 0000000000..9d25964116
--- /dev/null
+++ b/src/output/plugins/visualization/SoundInfoCache.hxx
@@ -0,0 +1,104 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+// Copyright The Music Player Daemon Project
+
+#ifndef SOUND_INFO_CACHE_HXX_INCLUDED
+#define SOUND_INFO_CACHE_HXX_INCLUDED
+
+#include "output/Timer.hxx"
+#include "pcm/AudioFormat.hxx"
+#include "thread/Mutex.hxx"
+#include "util/AllocatedArray.hxx"
+
+#include <chrono>
+#include <utility>
+
+namespace Visualization {
+
+/**
+ * \brief Thread-safe cache for recent PCM data
+ *
+ *
+ * Class SoundInfoCache maintains a ring buffer (AKA circular buffer) for PCM
+ * data to cap the amount of memory used. It keeps two pointers into that
+ * buffer: the beginning and the end of valid data, along with the timestamps
+ * corresponding to each.
+ *
+ * The general contract is that once the ctor returns, the caller has an
+ * instance with an empty ring buffer & that is ready to accept data. Time
+ * starts from the first invocation of Add(). Successive invocations of Add()
+ * are assumed to represent contiguous ranges of sound data (i.e. there is no
+ * way to represent gaps).
+ *
+ * Instances may have their methods invoked by multiple threads, so any method
+ * invocation will block on acquiring a Mutex. I had initially considered a
+ * single-writer, multi-reader lock in the interests of allowing many
+ * simultaneous reads, but in practice it would not be an improvement, since
+ * there is only one reader & one writer, and the writer, empirically, is the
+ * more frequent caller.
+ *
+ * A circular buffer is surprisingly difficult to write. I considered
+ * abstracting this implementation into a general-purpose library class, but
+ * there are a number of implementation-specific choices arguing against that:
+ *
+ * - using a flag versus wasting a slot to distinguish full from empty
+ * - overwrite versus drop when new data won't fit
+ * - copy in bulk (via `memcpy()`) versus copying slot-by-slot
+ *
+ * In the end I decided to just write an application-specific implementation.
+ *
+ *
+ */
+
+class SoundInfoCache {
+public:
+	typedef std::chrono::system_clock::duration Duration;
+	typedef std::chrono::time_point<std::chrono::system_clock> Time;
+
+private:
+	AudioFormat fmt;
+	/// Time per frame, in seconds
+	double secs_per_frame;
+	/// Sample size, in bytes
+	unsigned frame_size;
+	/* Mutex guarding the ring buffer since instances will be accessed from
+	   multiple threads */
+	mutable Mutex mutex;
+	/// this is the ring buffer
+	AllocatedArray<std::byte> ring;
+	/// # of bytes currently in the ring buffer (as distinct from capacity)
+	std::size_t cb;
+	/// Valid PCM data exists in buf[p0, p1)
+	size_t p0, p1;
+	/// Time t0 corresponds to p0, t1 to p1
+	Time t0, t1;
+
+public:
+	/* Create a cache storing \a buf_span time's worth of PCM data in
+	   format \a audio_format */
+	SoundInfoCache(const AudioFormat &audio_format,
+		       const Duration &buf_span);
+
+public:
+	/* Add \a size bytes of PCM data to the cache; \a data is assumed to be
+	   PCM data in our audio format */
+	void Add(const void *data, size_t size);
+	AudioFormat GetFormat() const noexcept {
+		return fmt;
+	}
+	/* Read \a nsamp audio samples from the \e beginning of the buffer;
+	   will return false if \a buf is not large enough to accommodate
+	   that */
+	bool GetFromBeginning(size_t nsamp, void *buf, size_t cbbuf) const;
+	/* Retrieve \a nsamp PCM samples ending at time \a t; copy them into
+	   \a buf; will return false if this cannot be done for any reason */
+	bool GetByTime(size_t nsamp, Time t, void *buf, size_t cbbuf) const;
+	/// Return true IFF the ring buffer is empty
+	bool Empty() const;
+	/// Retrieve the time range for which this cache has data
+	std::pair<Time, Time> Range() const;
+	/// Return the # of bytes in the buffer (as opposed to buffer capacity)
+	std::size_t Size() const;
+};
+
+} // namespace Visualization
+
+#endif // SOUND_INFO_CACHE_HXX_INCLUDED
diff --git a/src/output/plugins/visualization/VisualizationClient.cxx b/src/output/plugins/visualization/VisualizationClient.cxx
new file mode 100644
index 0000000000..8b86d4f286
--- /dev/null
+++ b/src/output/plugins/visualization/VisualizationClient.cxx
@@ -0,0 +1,532 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+// Copyright The Music Player Daemon Project
+
+#include "VisualizationClient.hxx"
+
+#include "Log.hxx"
+#include "event/Chrono.hxx"
+#include "util/Domain.hxx"
+
+#include <chrono>
+#include <iterator>
+
+#include <unistd.h>
+
+const Domain d_vis_client("vis_client");
+
+inline
+typename std::chrono::microseconds::rep
+NowTicks() {
+	return std::chrono::duration_cast<std::chrono::microseconds>(
+		std::chrono::system_clock::now().time_since_epoch()).count();
+}
+
+// Invoked when the client connects and the plugin is in the "closed" state.
+Visualization::VisualizationClient::VisualizationClient(
+	UniqueSocketDescriptor fd,
+	EventLoop &event_loop,
+	const SoundAnalysisParameters &params):
+	BufferedSocket(fd.Release(), event_loop), // schedules a read
+	sound_params(params),
+	num_samp(params.GetNumSamples()),
+	protocol_state(ProtocolState::Init),
+	timer(event_loop, BIND_THIS_METHOD(OnTimer))
+{ }
+
+// Invoked when the client connects and the plugin is in the "opened" state.
+Visualization::VisualizationClient::VisualizationClient(
+	UniqueSocketDescriptor fd,
+	EventLoop &event_loop,
+	const SoundAnalysisParameters &params,
+	const std::shared_ptr<SoundInfoCache> &pcache):
+	BufferedSocket(fd.Release(), event_loop), // schedules a read
+	sound_params(params),
+	num_samp(params.GetNumSamples()),
+	pcm_state(HavePcmData {
+		pcache, Visualization::SoundAnalysis(params, pcache) }),
+	protocol_state(ProtocolState::Init),
+	timer(event_loop, BIND_THIS_METHOD(OnTimer))
+{ }
+
+void
+Visualization::VisualizationClient::OnPluginOpened(
+	const std::shared_ptr<SoundInfoCache> &pcache)
+{
+	FmtDebug(d_vis_client, "[{}] VisualizationClient::OnPluginOpened("
+		 "this:{},tid:{},state:{})", NowTicks(), (size_t)this,
+		 gettid(), (int)protocol_state);
+
+	pcm_state = HavePcmData {
+		pcache, Visualization::SoundAnalysis(sound_params, pcache)
+	};
+
+	HandleFirstFrame();
+}
+
+void
+Visualization::VisualizationClient::OnPluginClosed()
+{
+	FmtDebug(d_vis_client, "[{}] VisualizationClient::OnPluginClosed("
+		 "this:{},tid:{},state:{})", NowTicks(), (size_t)this,
+		 gettid(), (int)protocol_state);
+
+	if (IsClosed()) {
+		Shutdown();
+		return;
+	}
+
+	// Update `pcm_state`...
+	pcm_state = std::monostate{};
+	/* but no need to do anything else. We'll detect the fact that the
+	   plugin is closed during subsequent state transitions & handle it
+	   there. */
+}
+
+Visualization::VisualizationClient::~VisualizationClient() {
+	FmtDebug(d_vis_client, "[{}] VisualizationClient::~VisualizationClient("
+		 "this:{},tid:{},state:{})", NowTicks(), (size_t)this,
+		 gettid(), (int)protocol_state);
+	/* This will be invoked on the main thread; the socket & underlying
+	   `SocketEvent` will be torn-down on the I/O thread. */
+	timer.Cancel();
+}
+
+BufferedSocket::InputResult
+Visualization::VisualizationClient::OnSocketInput(void *data,
+						  size_t length) noexcept
+{
+	FmtDebug(d_vis_client, "[{}] VisualizationClient::OnSocketInput("
+		 "this:{},tid:{},state:{},length:{})", NowTicks(),
+		 (size_t)this, gettid(),
+		 (int)protocol_state, length);
+
+	// We have data available to be read, and it's present in `data`...
+	if (ProtocolState::Init != protocol_state) {
+		Shutdown();
+		return InputResult::CLOSED;
+	}
+
+	// attempt to parse it as a CLIHLO message...
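+	/* Three outcomes are possible: the buffer may not yet hold a
+	 * complete message (keep reading), it may hold a malformed message
+	 * (close the connection), or it may parse cleanly (respond with
+	 * SRVHLO). */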
+ ClientHello clihlo; + ParseResult parse_result = ParseClihlo(data, length, clihlo); + if (ParseResult::NEED_MORE_DATA == parse_result) { + return InputResult::MORE; + } else if (ParseResult::ERROR == parse_result) { + LogError(d_vis_client, + "Expected CLIHLO, received invalid message."); + Shutdown(); + return InputResult::CLOSED; + } + + FmtDebug(d_vis_client, "[{}] Got CLIHLO: {}fps, tau={}ms", NowTicks(), + clihlo.requested_fps, clihlo.tau); + + if (0 != clihlo.major_version || 1 != clihlo.minor_version) { + FmtWarning(d_vis_client, "Unexpected protocol version {}.{} " + "requested-- proceeding to serve 0.1.", + clihlo.major_version, clihlo.minor_version); + } + + if (0 == clihlo.requested_fps) { + LogError(d_vis_client, + "Client requested 0fps-- closing connection."); + Shutdown(); + return InputResult::CLOSED; + } + + // OK-- we have timings: + timings = HaveClientInfo { + std::chrono::milliseconds(clihlo.tau), + std::chrono::milliseconds(int(1000. / clihlo.requested_fps)) + }; + + // Seems legit-- compose our response... + ConsumeInput(length); + + next_frame.clear(); + SerializeSrvhlo((std::byte)0, (std::byte)1, back_inserter(next_frame)); + + FmtDebug(d_vis_client, "[{}] Composed a SRVHLO frame, cancelled read, " + "scheduled a write, and shifted to state {}.", NowTicks(), + (int)ProtocolState::SrvHlo); + + // shift state... + protocol_state = ProtocolState::SrvHlo; + // and schedule a write. + event.CancelRead(); + event.ScheduleWrite(); + return InputResult::PAUSE; +} + +void +Visualization::VisualizationClient::OnSocketError( + std::exception_ptr ep) noexcept { + LogError(ep); + Shutdown(); +} + +void +Visualization::VisualizationClient::OnSocketClosed() noexcept { + FmtInfo(d_vis_client, "[{}] VisualizationClient::OnSocketClosed(" + "this:{},tid:{})", NowTicks(), (size_t)this, gettid()); + Shutdown(); +} + +void +Visualization::VisualizationClient::OnSocketReady(unsigned flags) noexcept +{ + FmtDebug(d_vis_client, "[{}] VisualizationClient::OnSocketReady(" + "this:{},tid:{},state:{},flags:{})", NowTicks(), (size_t)this, + gettid(), (int)protocol_state, flags); + + switch (protocol_state) { + case ProtocolState::Init: { + + if (0 == (flags & SocketEvent::READ)) { + FmtError(d_vis_client, "In state Init, got flags {} (" + "which do not contain READ/POLLIN); in this " + "state we expect a CLIHLO message.", flags); + Shutdown(); + return; + } + + if (flags & (SocketEvent::ERROR|SocketEvent::HANGUP)) { + FmtError(d_vis_client, "In state Init, got flags {} " + "which contains ERROR and/or HANGUP, " + "shutting-down.", flags); + Shutdown(); + return; + } + + // Will invoke `OnSocketInput()` + BufferedSocket::OnSocketReady(flags); + break; + } + case ProtocolState::SrvHlo: + HandleSrvHlo(flags); + break; + + case ProtocolState::FrameReady: + HandleFrameReady(flags); + break; + + default: + FmtError(d_vis_client, "VisualizationClient::OnSocketReady(" + "tid: {}, flags: {}) invoked in state {}-- BAILING!", + gettid(), flags, (int)protocol_state); + Shutdown(); + return; + } + +} + +/** + * \brief Update our sound analysis + * + * + * \return true if the analysis was successfully carried-out, false if it was + * not + * + * + * This method could fail to update the analysis for a few reasons: + * + * - the plugin could have been closed (in which case this implementation will + * shift to state ProtocolClosed) - the cache could not contain PCM data for + * the requested offset + * + * If this method returns true, the next FRAME is waiting in next_frame; the + * caller is responsible for 
scheduling a write. + * + * + */ + +bool +Visualization::VisualizationClient::ComposeSoundAnalysisFrame() +{ + using namespace std::chrono; + + FmtDebug(d_vis_client, "[{}] VisualizationClient::" + "ComposeSoundAnalysisFrame(this:{},tid:{},state:{})", + NowTicks(), (size_t)this, gettid(), (int)protocol_state); + + if (!PluginIsOpen()) { + protocol_state = ProtocolState::ProtocolClosed; + return false; + } + + auto now = system_clock::now(); + HavePcmData &pcm_data = std::get(pcm_state); + if (!pcm_data.analysis.Update(now + timings->tau)) { + return false; + } + + /* At this point, the data we wish to transport on the wire is residing + * inside `pcm_data.analysis`. It needs to be transformed into it's + * interchange format (IEEE 754, big-endian, single precision), and + * moved into a buffer laid-out according to the protocol. That's one + * copy. I don't want to spend a lot of time optimizing this right now, + * but I'd like to avoid a second one-- we'll ask the `SoundAnalysis` to + * `transform()` the data with a unary operator & output iterator we + * provide. */ + + SerializeSoundInfoFrame(pcm_data.analysis, back_inserter(next_frame)); + return true; +} + +/** + * \brief Handle the first frame-- if tau < 0 schedule the timer for -tau ms, + * else write a frame immediately + * + * + * If \c tau is less than zero, schedule a timer for -tau ms and shift state to + * Waiting. + * + * If \c tau is non-negative, attempt to carry-out a sound analysis. + * + * If that succeeds, schedule a write of the newly-populated frame buffer, + * schedule a write, and shift to state FrameReady. + * + * If the analysis failes, cancel any writes, schedule the timer for \a freq ms, + * and shift to state Waiting. + * + * + */ + +void +Visualization::VisualizationClient::HandleFirstFrame() +{ + auto tau = timings->tau; + auto freq = timings->freq; + if (tau < std::chrono::milliseconds::zero()) { + FmtDebug(d_vis_client, "[{}] VisualizationClient::" + "HandleFirstFrame([this:{}]) scheduling a write for " + "{} ms from now & transitioning to state {}.", + NowTicks(), (size_t)this, -tau.count(), + (int)ProtocolState::Waiting); + timer.Schedule(std::chrono::milliseconds(-tau)); + protocol_state = ProtocolState::Waiting; + } + else { + if (ComposeSoundAnalysisFrame()) { + FmtDebug(d_vis_client, "[{}] VisualizationClient::" + "HandleFirstFrame(this:{}) carried out sound " + "analysis, scheduled a write & is shifting to " + "state {}.", NowTicks(), (size_t)this, + (int)ProtocolState::FrameReady); + event.ScheduleWrite(); + timer.Schedule(std::chrono::milliseconds(freq)); + protocol_state = ProtocolState::FrameReady; + } else { + FmtDebug(d_vis_client, "[{}] VisualizationClient::" + "OnPluginOpened(this:{}) failed to perform " + "sound analysis; cancelling any outstanding " + "writes, scheduling another attempt for {}ms " + "from now & shifting to state {}.", + NowTicks(), (size_t)this, freq.count(), + (int)ProtocolState::Waiting); + event.CancelWrite(); + timer.Schedule(std::chrono::milliseconds(freq)); + protocol_state = ProtocolState::Waiting; + } + } +} + +/** + * \brief Handle socket events when in state FrameReady + * + * + * \brief flags Flags indicating the nature of the socket event that occasiioned + * this call + * + * + * This function will handle errors, hangups, and writes. In the last case, it + * will attempt to write the contents of next_frame. If successful, it will + * shift state to Waiting. 
+ * + * + */ + +void +Visualization::VisualizationClient::HandleFrameReady(unsigned flags) +{ + if (0 == (flags & SocketEvent::WRITE)) { + FmtError(d_vis_client, "In state FrameReady, got flags {} " + "(which do not contain WRITE/POLLOUT); in this state " + "we expect to be sending a sound analysis message.", + flags); + Shutdown(); + return; + } + + if (flags & (SocketEvent::ERROR|SocketEvent::HANGUP)) { + FmtError(d_vis_client, "In state FrameReady, got flags {} which " + "contains ERROR and/or HANGUP, shutting-down.", + flags); + Shutdown(); + return; + } + + if (!WriteFrame()) { + return; + } + + // Timer should already be active + protocol_state = ProtocolState::Waiting; +} + +/** + * \brief Handle socket events while in state SrvHlo + * + * + * \brief flags Flags indicating the nature of the socket event that occasiioned + * this call + * + * + * This method expects the event to be a "write ready" and responds by writing + * the contents of next_frame (presumably an SRVHLO message). If successful, and + * the plugin is open, it will handle first frame chores. If the plugin is + * closed, it will shift to state ProtocolClosed. + * + * + */ + +void +Visualization::VisualizationClient::HandleSrvHlo(unsigned flags) +{ + if (0 == (flags & SocketEvent::WRITE)) { + FmtError(d_vis_client, "In state SrvHlo, got flags {} (which " + "do not contain WRITE/POLLOUT); in this state we " + "expect to be sending an SRVHLO message.", flags); + Shutdown(); + return; + } + + if (flags & (SocketEvent::ERROR|SocketEvent::HANGUP)) { + FmtError(d_vis_client, "In state SrvHlo, got flags {} which " + "contains ERROR and/or HANGUP, shutting-down.", + flags); + Shutdown(); + return; + } + + // The SRVHLO should be waiting for us in `next_frame` + if (!WriteFrame()) { + return; + } + + if (PluginIsOpen()) { + HandleFirstFrame(); + } else { + FmtDebug(d_vis_client, "[{}] VisualizationClient::" + "HandleSrvHlo(): The visualization plugin is " + "closed; shifting to state {}.", + NowTicks(), (int)ProtocolState::ProtocolClosed); + protocol_state = ProtocolState::ProtocolClosed; + event.CancelWrite(); + } +} + +void +Visualization::VisualizationClient::LogSocketWriteError( + const socket_error_t &err) const noexcept +{ + if (IsSocketErrorSendWouldBlock(err)) { + LogNotice(d_vis_client, "OnSocketReady invoked, but write " + "would block(!)"); + return; + } else if (!IsSocketErrorClosed(err)) { + SocketErrorMessage msg(err); + FmtWarning(d_vis_client, "Failed to write to client: {}", + (const char *)msg); + } +} + +/* Timer callback-- invoked when it's time to compose the next sound analysis + * frame. This will re-schedule the timer regardless of success or failure of + * the sound analysis. 
*/
+void
+Visualization::VisualizationClient::OnTimer() noexcept
+{
+	FmtDebug(d_vis_client, "[{}] VisualizationClient::OnTimer(this:{},"
+		 "tid:{},state:{})", NowTicks(), (size_t)this, gettid(),
+		 (int)protocol_state);
+
+	if (ComposeSoundAnalysisFrame()) {
+		FmtDebug(d_vis_client, "VisualizationClient::OnTimer() "
+			 "carried out sound analysis, scheduled a write, "
+			 "and shifted to state {}.",
+			 (int)ProtocolState::FrameReady);
+		event.ScheduleWrite();
+		protocol_state = ProtocolState::FrameReady;
+	} else {
+		// Give up for now-- wait for the next timer event
+		FmtDebug(d_vis_client, "VisualizationClient::OnTimer() "
+			 "failed to carry out sound analysis; cancelling "
+			 "outstanding writes, shifting to state {}.",
+			 (int)ProtocolState::Waiting);
+		event.CancelWrite();
+		protocol_state = ProtocolState::Waiting;
+	}
+
+	timer.Schedule(timings->freq);
+}
+
+void
+Visualization::VisualizationClient::Shutdown() noexcept
+{
+	timer.Cancel();
+	event.CancelRead();
+	event.CancelWrite();
+	BufferedSocket::Close();
+	pcm_state = std::monostate{};
+	protocol_state = ProtocolState::Done;
+}
+
+bool
+Visualization::VisualizationClient::WriteFrame()
+{
+	ssize_t cb_written = GetSocket().Write({next_frame.begin(),
+						next_frame.end()});
+	if (0 > cb_written) {
+		LogSocketWriteError(GetSocketError());
+		Shutdown();
+		return false;
+	}
+
+	ssize_t cb_expected = next_frame.end() - next_frame.begin();
+
+	/* Handle the case of a partial write. (The SRVHLO frame, for
+	   instance, is only seven octets in size.) */
+	if (cb_written < cb_expected) {
+		FmtWarning(d_vis_client, "VisualizationClient::WriteFrame() "
+			   "wrote {} bytes of message-- expected {}.",
+			   cb_written, cb_expected);
+		/* It's no problem; just remove the bytes that have been written
+		 * from `next_frame`, schedule another write & bail. */
+		next_frame.erase(next_frame.begin(),
+				 next_frame.begin() + cb_written);
+		event.ScheduleWrite();
+		return false;
+	}
+
+	/* Finally, we should handle the case of `cb_written > cb_expected`.
+	 * Naturally, that "should" never happen, but I just can't leave the
+	 * case uncovered. One could argue that an assertion would be
+	 * justified, but I understand the maintainers to frown on assertions
+	 * in production code, so: */
+	if (cb_written > cb_expected) {
+		FmtError(d_vis_client, "VisualizationClient::WriteFrame() "
+			 "wrote {} bytes when only {} were expected to have "
+			 "been written out. 
This should be investigated.", + cb_written, cb_expected); + } + + FmtDebug(d_vis_client, "[{}] VisualizationClient::WriteFrame(tid:{}," + "state:{}) wrote {} bytes (of {}); cancelling any outstanding " + "writes & clearing the frame buffer.", NowTicks(), gettid(), + (int)protocol_state, cb_written, cb_expected); + + event.CancelWrite(); + next_frame.clear(); + + return true; +} diff --git a/src/output/plugins/visualization/VisualizationClient.hxx b/src/output/plugins/visualization/VisualizationClient.hxx new file mode 100644 index 0000000000..d7a79b6e9d --- /dev/null +++ b/src/output/plugins/visualization/VisualizationClient.hxx @@ -0,0 +1,269 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +// Copyright The Music Player Daemon Project + +#ifndef VISUALIZATION_CLIENT_HXX_INCLUDED +#define VISUALIZATION_CLIENT_HXX_INCLUDED 1 + +#include "SoundAnalysis.hxx" +#include "Protocol.hxx" + +#include "event/BufferedSocket.hxx" +#include "event/FineTimerEvent.hxx" +#include "net/SocketError.hxx" +#include "net/UniqueSocketDescriptor.hxx" + +namespace Visualization { + +class SoundInfoCache; + +/** + * \class VisualizationClient + * + * \brief Represents a TCP connection to one visualization client + * + * + * This class implements the server side of the MPD visualization protocol, + * version 1, for a single client. + * + * The \ref vis_out_plugin_protocol_proto "protocol" suggests a finite state + * machine (FSM): + * + * events: + * + * - read ready + * - write complete + * - timer fired + * - plugin opened + * - plugin closed + * + * actions + * + * - schedule write + * - schedule read + * - cancel write + * - cancel read + * - schedule timer(interval) + * + \code + +------+ + +---->| Init | (read scheduled) + read ready, | +------+ + need more | | | + +------+ | read ready, recv CLIHLO, + | schedule a write + | + v + +--------+ + +--> | SRVHLO |-----------------------------+ + write complete,| +--------+ | + more to write | | | | | write complete, + +-----+ | | | plugin closed + | +----+ | (cancel write) + | | write complete, | + +----+ | plugin open, | + write complete, | | tau < 0 | + plugin open, | | (cancel write) | + tau >= 0 | | (schedule timer(-tau)) | + (update analysis) | | or | + (schedule write) | | failed analysis v + (schedule timer(freq)) | | (cancel write) +--------+ + | | (schedule timer(freq)) | Closed | + v v +--------+ + +------------+ +---------+ + +---->| FrameReady | | Waiting |<----+ + | +------------+ +---------+ | + | | ^ | ^ | | | + +------+ | | | | | | + write complete, | +---------------+ | +-------+ + more to write | write complete | no sound analysis + | (cancel write) | (schedule timer(freq)) + | | + +----------------------+ + timer fired + (schedule timer(freq)) + (schedule write) + + \endcode + * + * This is complicated by the fact that the output plugin that owns us may, at + * any given point in time, be "open" or "closed"; it is only when open that we + * know the format of the PCM data being played, and hence that this client may + * have a reference to the PCM data cache along with a `SoundAnalysis` instance + * necessary for performing sound analsysis. + * + * 1. instances start life waiting for the CLIHLO message (state :=> Init) + * + * 2. on read ready (state must be Init): + * + * 1) complete the read + * + * 2) compose the SRVHLO message + * + * 3) schedule a write + * + * 4) state :=> SrvHlo + * + * 3. 
on write ready (state must be SrvHlo) + * + * 1) write the current frame + * + * 2) branch: + * + * - if the plugin is closed, state :=> Closed + * - if the client needs the first frame & the plugin is open + * + compose the frame + * + schedule a write + * + state :=> FrameReady + * - else + * + schedule the timer for -tau ms + * + state :=> Waiting + * + * 3. on write ready (state must be FrameReady) + * + * 1) write the current frame + * + * 2) schedule the timer for 1/fps ms + * + * 3) state :=> Waiting + * + * 4. on timer firing (state must be Waiting) + * + * 1) if the plugin is open: + * + * - compose the next frame + * - schedule a write + * - state :=> FrameReady + * + * + */ + +class VisualizationClient : BufferedSocket { + + Visualization::SoundAnalysisParameters sound_params; + size_t num_samp; + + /// Data available to us when the visualization output plugin is open + struct HavePcmData { + // I wish C++ had a `not_null` class + std::shared_ptr pcache; + Visualization::SoundAnalysis analysis; + }; + /// Plugin open/closed state-- cf. PluginIsOpen() + std::variant pcm_state; + + /// The protocol can be represented as an FSM + enum class ProtocolState { + /* FSM initial state; the socket has been established, but no + * communication has taken place; we are expecting a CLIHLO + * message to arrive (i.e. a READ/POLLIN notification) */ + Init, + /* CLIHLO has arrived, we've composed the SRVHLO and are waiting + * for the socket to become available for write */ + SrvHlo, + /* The handshake has been completed, but the plugin is currently + * closed, so we can't perform sound analysis */ + ProtocolClosed, + /// Handshake complete, waiting for the timer to fire + Waiting, + /* Handshake complete, frame composed, waiting for the socket to + * become available for write */ + FrameReady, + /// The socket has been closed and this instance may be reaped + Done, + } protocol_state; + + /// Information available to us once we've parsed the CLIHLO message + struct HaveClientInfo { + std::chrono::milliseconds tau; + std::chrono::milliseconds freq; // 1/fps + }; + /* A tuple whose first member is the offset from song time at which this + * client has requested sound analysis, and the second is the interval + * at which frames shall be sent (1/fps)-- both are only available to us + * after the CLIHLO message has been parsed and we are in state SrvHlo + * or later. */ + std::optional timings; + /// Timer governing frame transmission + FineTimerEvent timer; + /* Next frame to be transmitted (if any) in serialized format + * (i.e. 
ready to be written directly); empty `vector` denotes no such + * frame */ + std::vector next_frame; + +public: + /* Constructor invoked when a new client connects & the plugin is + closed */ + VisualizationClient( + UniqueSocketDescriptor fd, EventLoop &event_loop, + const Visualization::SoundAnalysisParameters ¶ms); + /// Constructor invoked when a new client connects & the plugin is open + VisualizationClient( + UniqueSocketDescriptor fd, EventLoop &event_loop, + const Visualization::SoundAnalysisParameters ¶ms, + const std::shared_ptr &pcache); + virtual ~VisualizationClient(); + + /// Invoked by the server when the plugin is opened + void OnPluginOpened( + const std::shared_ptr &pcache); + /// Invoked by the server when the plugin is closed + void OnPluginClosed(); + bool + IsClosed() const noexcept { + return ProtocolState::Done == protocol_state; + } + +protected: + + ///////////////////////////////////////////////////////////////////////// + // BufferedSocket interface // + ///////////////////////////////////////////////////////////////////////// + + virtual BufferedSocket::InputResult + OnSocketInput(void *data, size_t length) noexcept override; + virtual void OnSocketError(std::exception_ptr ep) noexcept override; + virtual void OnSocketClosed() noexcept override; + + /** + * Invoked when an event has occurred on this socket. \a flags + * will be a bitmask made of members of the EPollEvents enumeration. + * For reference: + * + * - READ = EPOLLIN = 1 + * - WRITE = EPOLLOUT = 4 + * - ERROR = EPOLLERR = 8 + * - HANGUP = EPOLLHUP = 16 + * + */ + virtual void OnSocketReady(unsigned flags) noexcept override; + +private: + + /// Update our sound analysis + bool ComposeSoundAnalysisFrame(); + /* Handle the first frame-- if tau < 0 schedule the timer for -tau ms, + * else write a frame immediately */ + void HandleFirstFrame(); + /// Handle a socket event while in state FrameReady + void HandleFrameReady(unsigned flags); + /// Handle a socket event while in state SrvHlo + void HandleSrvHlo(unsigned flags); + /// Utility function-- log a socket_error_t after an attempted write + void LogSocketWriteError(const socket_error_t &err) const noexcept; + /* Timer callback-- invoked when it's time to compose the next sound + * analysis frame */ + void OnTimer() noexcept; + bool + PluginIsOpen() const { + return 0 != pcm_state.index(); + } + /* Close our underlying socket, drop our shared cache & shift state to + * Done */ + void Shutdown() noexcept; + bool WriteFrame(); + +}; + +} // namespace Visualization + +#endif // VISUALIZATION_CLIENT_HXX_INCLUDED diff --git a/src/output/plugins/visualization/VisualizationOutputPlugin.cxx b/src/output/plugins/visualization/VisualizationOutputPlugin.cxx new file mode 100644 index 0000000000..8406e0aaa4 --- /dev/null +++ b/src/output/plugins/visualization/VisualizationOutputPlugin.cxx @@ -0,0 +1,712 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +// Copyright The Music Player Daemon Project + +#include "VisualizationOutputPlugin.hxx" +#include "SoundAnalysis.hxx" +#include "SoundInfoCache.hxx" +#include "VisualizationServer.hxx" + +#include "Log.hxx" +#include "config/Block.hxx" +#include "event/Call.hxx" +#include "output/Interface.hxx" +#include "output/OutputPlugin.hxx" +#include "util/Domain.hxx" + +#include +#include // for gettid() + +namespace Visualization { + +/** + * \page vis_out_protocol Visualization Network Protocol + * + * See \ref vis_out "RFC: Visualization Output Plugin" for background. 
+ * + * + * \section vis_out_protocol_timing Timing + * + * In order to deliver sound data to the client at the proper time, the protocol + * needs to take into account: + * + * - network latency: the delta between writing the sound data to the socket & + * its receipt at the client + * + * - player buffering: the player may buffer sound data (mplayer, for instance, + * buffers half a second's worth of audio before beginning playback by + * default) + * + * - render time: the client presumably wishes the current frame to appear + * on-screen at the moment the current sound information is ending + * + * Throughout, let \e t be "song time" be measured on the server, and T(t) be + * sound information for song time \e t. Let FPS be the frames-per-second at + * which the client would like to render. + * + * Then, at an interval of 1/FPS seconds, the server needs to write + * + \verbatim + T(t - {buffer time} + {render time} + {one way latency}) + \endverbatim + * + * to the client socket. If we denote that time offset (i.e. the render time + + * one-way latency minus the buffer time) by tau, then the server should wait + * max(0, -tau) ms to write the first frame. + * + * A few examples will illustrate. + * + * \subsection vis_out_protocol_timing_eg_1 Example 1 + * + * Let the client render time be 4ms and round-trip network latency be + * 6ms. Assume no player buffering. In order to render a frame corresponding to + * song time \e t, the client would need, at time \e t - 4 ms, sound information + * corresponding to time \e t, or T(t). The server would need to \e send that + * information at time \e t - 7ms (half of one round-trip plus render time). + * + * In other words, on the server side at song time \e t, we would need to write + * T(t + 7ms) to the client socket. If the server writes T(t+7ms) immediately, + * the client will receive it at \e t + 4ms, take 4ms to render the next frame, + * and so at \e t + 7ms hence, finish rendering T(t+7). + * + * \subsection vis_out_protocol_timing_eg_2 Example 2 + * + * Imagine we are running the same client on a laptop, connected to an MPD + * server over the internet, and using mplayer as the player. This gives 500ms + * of buffer time. Let us assume the same 4ms render time, but now a 20ms + * round-trip time. + * + * In order to render a frame corresponding to song time \e t, the client would + * need, at time \e t - 4ms, T(t). This would need to be sent from the server at + * time \e t - 14ms. We now need to incorporate the client-side buffering, + * however. Song time \e t will be actually played on the client at \e t + 500ms + * on the server. + * + * In other words, on the server side at song time \e t, we would need to write + * T(t-486ms) to the client socket. + * + * Since the sound won't start on the client for 0.5 sec, it would make no sense + * to begin writing sound information for 486ms. Let t(0) be the moment the + * client connects and the player begins buffering. If, at t(0) + 486ms, the + * server writes T(t(0)), the client will receive it at t(0) + 496ms & complete + * rendering it at t(0) + 500ms, which is when the client-side player will + * begin playing song time t(0). + * + * \section vis_out_protocol_proto The Protocol + * + * \subsection vis_out_protocol_proto_design Design + * + * The author is unaware of any existing network protocols in this area, so he + * designed his own after reviewing the Shoutcast & Ultravox + * protocols. Experience with the TLS & 802.11 protocols also informed this + * design. 
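+ *
+ * To make the \ref vis_out_protocol_timing "timing" rule above concrete,
+ * here is a minimal sketch of the offset computation; the function &
+ * parameter names are illustrative only, not part of the implementation:
+ *
+ \code
+ using namespace std::chrono;
+
+ // tau = render time + one-way latency - client-side buffer time
+ constexpr milliseconds
+ ComputeTau(milliseconds render, milliseconds one_way, milliseconds buffer)
+ {
+	return render + one_way - buffer;
+ }
+
+ // Example 1: 4ms render, 3ms one-way latency, no buffering: tau = +7ms,
+ // so the server writes T(t + 7ms) immediately.
+ static_assert(ComputeTau(4ms, 3ms, 0ms) == 7ms);
+
+ // Example 2: 4ms render, 10ms one-way latency, 500ms buffering: tau = -486ms,
+ // so the server waits max(0, -tau) = 486ms before writing the first frame.
+ static_assert(ComputeTau(4ms, 10ms, 500ms) == -486ms);
+ \endcode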
+ *
+ * Design goals include:
+ *
+ * - client convenience
+ *   - this in particular drove the choice to stream updates; everything
+ *     needed to simply push the data out is knowable at handshake time,
+ *     so why force the client to send a request?
+ * - efficiency on the wire
+ *   - binary format
+ *   - streaming preferred over request/response
+ * - future extensibility
+ *   - protocol versioning built-in from the start
+ * - parsing convenience
+ *   - streaming messages come with a few "magic bytes" at the start
+ *     to assist clients in "locking on" to the stream & recovering from
+ *     corrupted data, client-side disruptions & so forth
+ *   - all messages conform to the "type-length-value" (TLV) format
+ *     beloved of parser writers
+ *
+ * Responses to the initial RFC also informed the protocol's first
+ * implementation: I've stripped out all but the essentials in pursuit of a
+ * minimally effective protocol that is still extensible should it prove
+ * useful.
+ *
+ *
+ * \subsection vis_out_protocol_proto_overview Overview
+ *
+ * The protocol is a combination of request/response as well as streaming. After
+ * an initial handshake (client goes first) the server will begin streaming
+ * messages to the client; i.e. at the interval the client specified during the
+ * initial handshake, the server will send FRAME messages containing sound
+ * information useful for visualizers. The client need not request these
+ * messages, nor need the client acknowledge them in any way.
+ *
+ * Schematically, a conversation looks like this:
+ *
+ \verbatim
+   Client                                      Server
+
+     desired protocol version
+     tau (buffer offset)
+     frame rate          --------- CLIHLO --------->
+                                   ...
+                         <-------- SRVHLO ---------  offered protocol version
+
+                         <-------- FRAME  ---------  samples, spectrum
+                                                       | 1/fps sec
+                         <-------- FRAME  ---------  samples, spectrum
+                                   ...
+                                (forever)
+ \endverbatim
+ *
+ * There is no formal "close" or "teardown" message; each side simply detects
+ * when the other has gone away & treats that as the end of the conversation.
+ *
+ *
+ * \subsection vis_out_protocol_proto_msgs Messages
+ *
+ * All messages:
+ *
+ * - integers use network byte order (i.e. big endian)
+ * - use TLV format (streaming messages prepend magic bytes)
+ *
+ \verbatim
+
+ +---------+-----------------------+-----------------+-----------------------+--------+
+ |(prefix) | TYPE (16-bit unsigned)|     LENGTH      |        PAYLOAD        | CHECK  |
+ |         | class | message type  | 16-bits unsigned|     LENGTH bytes      | 1 byte |
+ |---------|-------+---------------|-----------------|-----------------------+--------+
+ |63ac4003 | 4 bits| 12 bits       | (max len 65535) | format is msg-specific|   00   |
+ +---------+-----------------------+-----------------+-----------------------+--------+
+
+ \endverbatim
+ *
+ * Notes:
+ *
+ * - the prefix is only prepended to FRAME messages, to enable clients to "lock
+ *   on" to a pre-existing stream of data; 0x63ac4003 were the first four bytes
+ *   I pulled from \c /dev/urandom on my dev workstation on Monday, September 04.
+ *
+ * - the message type comprises two values packed into a uint16_t:
+ *
+ *   - class: (type & 0xf000) >> 12:
+ *     - 00: handshake
+ *     - 01: streaming (FRAME, e.g.)
+ *
+ *   - message type: (type & 0x0fff); see below for values
+ *
+ * - the "length" field is the length of the \e payload \e only (i.e. \e not the
+ *   length of the entire message)
+ *
+ * - the "check" byte is intended as a sanity test & shall always be zero.
+ *   Although, what would the client do if the check failed?
There's no + * provision in this protocol to re-request the frame. Discard it, I suppose. + * + * The following subsections define the PAYLOAD portion of the above messages. + * + * \subsubsection vis_out_protocol_proto_clihlo CLIHLO + * + * No prefix. The class is 0x0 (handshake) & the message type is 0x000. + * + * Payload: + * + \verbatim + + +---------------+---------------+---------------+---------------+ + | major version | minor version | requested FPS | requested TAU | + | ------------- | ------------- |-------------- |---------------+ + | uint8_t | uint8_t | uint16_t | int16_t | + +---------------+---------------+---------------+---------------+ + + \endverbatim + * + * Payload size: 6 octets + * + * \subsubsection vis_out_protocol_proto_srvhlo SRVHLO + * + * No prefix. The class is 0x0 (handshake) & the message type is 0x001. + * + * Payload: + * + \verbatim + + +---------------+---------------+ + | major version | minor version | + | ------------- | ------------- | + | uint8_t | uint8_t | + +---------------+---------------+ + + \endverbatim + * + * \subsubsection vis_out_protocol_proto_frame FRAME + * + * Prefix. The class is 0x1 (streaming) & the message type is 0x000. + * + * Below, \c float denotes a floating-point value, expressed in IEEE 754 + * single-precision format, in big-endian byte order. \c complex denotes a pair + * of floating-point values (the real & imaginary components of a complex + * number, in that order) in the same format. + * + * Payload: + * + \code + + +----------+----------+-------------+-----------+----------+---------+---------+----------+------------+---------------+-----------------+ + | num_samp | num_chan | sample_rate | waveforms | num_freq | freq_lo | freq_hi | freq_off | coeffs | power_spectra | bass/mids/trebs | + | -------- | -------- | ----------- | --------- | -------- | ------- | ------- | -------- | ---------- | ------------- | --------------- | + | uint16_t | uint8_t | uint16_t | see below | uint16_t | float | float | uint16_t | see below | see below | see below | + +----------+----------+-------------+-----------+----------+---------+---------+----------+------------+---------------+-----------------+ + + waveforms: + + +----------------------+----------------------+-----+---------------------------------+ + | waveform for chan. 0 | waveform for chan. 1 | ... | waveform for chan. num_chan - 1 | + | -------------------- | -------------------- | ... | ------------------------------- | + | float | ... | float | float | ... | float | ... | float | ... | float | + | -------------------- | -------------------- | ... | ------------------------------- | + | (num_samp floats) | (num_samp floats) | ... | (num_samp floats) | + +----------------------+----------------------+-----+---------------------------------+ + + total: num_samp * num_chan * 4 octets + + coeffs: + + +--------------------------+--------------------------+-----+-------------------------------------+ + | freq. domain for chan. 0 | freq. domain for chan 1. | ... | freq. domain for chan. num_chan - 1 | + | ------------------------ + -------------------------+---- + ----------------------------------- | + | complex | ... | complex | complex | ... | complex | ... | complex | complex | ... | complex | + | ------------------------ +--------------------------+-----+-------------------------------------| + | num_freq complex | num_freq complex | ... 
| num_freq complex         |
+ +--------------------------+--------------------------+-----+-------------------------------------+
+
+ total: num_chan * num_freq * 8 octets
+
+ power spectra:
+
+ +-----------------------------+-----+---------------------------------------+
+ | power spectrum for chan. 0  | ... | power spectrum for chan. num_chan - 1 |
+ | --------------------------- +-----+ ------------------------------------- |
+ | float | float | ... | float | ... |        float | float | ... | float   |
+ | --------------------------- + --- + ------------------------------------- |
+ | num_freq floats             | ... | num_freq floats                       |
+ +-----------------------------+-----+---------------------------------------+
+
+ total: num_chan * num_freq * 4 octets
+
+ bass/mids/trebs
+
+ +-----------------------------+-----+----------------------------------------+
+ | bass/mids/trebs for chan. 0 | ... | bass/mids/trebs for chan. num_chan - 1 |
+ | --------------------------- +-----+ -------------------------------------- |
+ | float | float | float       | ... | float | float | float                  |
+ +-----------------------------+-----+----------------------------------------+
+
+ total: num_chan * 12 octets
+
+ payload size: 17 + num_samp * num_chan * 4 + num_chan * num_freq * 8
+                  + num_chan * num_freq * 4 + num_chan * 12
+             = 17 + 4 * num_chan * (num_samp + 3 * num_freq + 3)
+
+ \endcode
+ *
+ * - \c num_samp: the number of audio samples used in this analysis: this is set
+ *   in plugin configuration and in practice needn't be particularly large (512
+ *   is the default setting). This determines the number of values in
+ *   \c waveforms, and in part the number of values in \c frequencies and
+ *   \c power_spectra (see below)
+ *
+ * - \c num_chan: the number of audio channels used in this analysis: this is
+ *   determined by the audio stream being played at any given time, but 2
+ *   (i.e. stereo) is typical
+ *
+ * - \c sample_rate: the number of samples per second at which this audio stream
+ *   is encoded (44100 is typical)
+ *
+ * - \c waveforms: the PCM data on which this analysis was based; there will be
+ *   \c num_chan sets of \c num_samp floats (one for each channel, arranged one
+ *   after the other; i.e. not interleaved)
+ *
+ * - \c num_freq: the number of frequency values returned for each waveform in
+ *   this frame; this is a function of the sample rate, the number of audio
+ *   samples, and the frequency cutoffs with which the plugin was configured (on
+ *   which more below)
+ *
+ * - \c freq_lo, \c freq_hi: the frequency range returned; this is set in plugin
+ *   configuration. The range of human perception is roughly 200Hz to 20,000Hz,
+ *   but in practice musical sound data contains little information above 10-12K
+ *   Hz, so a typical setting for this range is 200Hz and 10000Hz.
+ *
+ * - \c freq_off: the index corresponding to \c freq_lo; this can be used by the
+ *   caller to map a Fourier coefficient to a frequency (see \c coeffs, below)
+ *
+ * - \c coeffs: the Fourier coefficients for each waveform, expressed as complex
+ *   numbers; the i-th value in this range is the \c freq_off + \c i -th Fourier
+ *   coefficient, corresponding to a frequency of
+ *
+ \code
+
+    (freq_off + i) * samp_rate
+    --------------------------- Hz
+             num_samp
+
+ \endcode
+ *
+ * The reason for this convention is that the plugin will \e only return the
+ * Fourier coefficients within the range defined by \c freq_lo & \c freq_hi.
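+ *
+ * By way of illustration, a client might map a returned coefficient index to
+ * its frequency with a helper like the following (a sketch only; the names
+ * are not part of the protocol):
+ *
+ \code
+ // Frequency, in Hz, to which the i-th returned Fourier coefficient
+ // corresponds, per the convention above.
+ static float
+ CoefficientFrequency(unsigned i, unsigned freq_off, unsigned sample_rate,
+		      unsigned num_samp)
+ {
+	return float(freq_off + i) * float(sample_rate) / float(num_samp);
+ }
+
+ // E.g. with sample_rate 44100, num_samp 512 & freq_off 3, the 0th returned
+ // coefficient corresponds to 3 * 44100 / 512, or roughly 258 Hz.
+ \endcode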
+ * + * Note that Discrete Fourier Transforms of real-valued series (such as our PCM + * waveform) display the Hermitian property: + * + \code + * + C(i) = C(n-i) + + \endcode + * + * where '*' denotes complex conjugation. Many libraries take advantage of this + * to save space by only returning the first n/2 + 1 Fourier coefficients (since + * the remaining coefficients can be readily computed from those). The + * application of a frequency window spoils this nice symmetry. + * + * - \c power_spectra: the power spectrum for each channel; this is merely the + * magnitude of the Fourier coefficent at each frequency. Strictly speaking + * the client could compute this for themselves, but this is such a frequently + * used value the plugin computes & transmits it as a convenience to the + * caller, There are again \c num_freq values. + * + * - bass/mids/trebs: once the frequency domain is truncated to the given + * bounds, the number of octaves therein is divided into three equal + * bands and the power in each band is summed (this is done separately + * for each channel) + * + * A number of these quantities won't change; they're defined in plugin + * configuration; \c num_samp, \c freq_lo & \c freq_hi could, in principle, be + * moved to the SRVHLO message. + * + * Furthermore, \c num_chan, \c sample_rate and hence \c num_freq are set at the + * start of each new audio stream, and so could be communicated once at that + * point & omitted from subsequent frames. + * + * That said, this would complicate client implementations for the sake of + * saving a few bytes on the wire; I've chosen to simply communicate this + * information in each frame. + * + * + */ + +/** + * \page vis_out_arch Layout of the Visualization Output Plugin + * + * \section vis_out_arch_intro Introduction + * + * There are, at the time of this writing, two other output plugins that provide + * socket servers: HttpdOutput & SnapcastOutput. They both follow a similar + * pattern in which the plugin subclasses both AudioOutput \e and + * ServerSocket. Since I have chosen a different approach, I should both + * describe the layout of VisualizationOutput and explain my choice. + * + * \section vis_out_arch_cyclic Cyclic Dependencies + * + * While they subclass privately (implying an "implemented-in-terms-of" rather + * than "is-a" relationship with their superclasses), HttpdOutput & + * SnapcastOutput in practice handle the duties of being both an AudioOutput and + * a ServerSocket. This introduces not one but two cyclic dependencies in their + * implementations: + * + * 1. the ServerSocket half of them is responsible for creating new clients, but + * the clients are the ones who detect that their socket has been closed; they + * then need a back-reference to signal their parent that they should be + * destroyed (by calling RemoveClient() through their back-reference). + * + * 2. the AudioOutput half of them is responsible for pushing new data derived + * from PCM data out to all their clients, while their clients request + * information & service from their parent, again requiring a back reference + * (GetCodecName() on the Snapcast client, e.g.) 
+ * + * Cyclic dependencies carry with them drawbacks: + * + * - they increase compilation times because when one file in the cycle is + * changed, all the other translation units need to be recompiled + * + * - they increase coupling, increasing the chances that a change in + * one place will break others + * + * - code reuse becomes more difficult-- trying to hoist one file out involves + * bringing all the other files in the cycle along with it + * + * - unit testing becomes harder-- the smallest unit of testable + * funcationality becomes the union all the the translation units in the + * cycle + * + * \section vis_out_arch_threads Too Many Threads! + * + * This arrangement entails another problem: HttpdOutput & SnapcastOutput + * instances have their methods invoked on two threads; the main I/O thread as + * well as the player control thread. This means that access to some state needs + * to be guarded by a mutex (in the case of HttpdOutput, the client list & the + * pages), but \e not others (again in the case of HttpdOutput, content or + * genre). + * + * \section vis_out_arch_demotion Breaking Dependency Cyles Through Demotion + * + * I instead chose to have VisualizationOutput \e be an AudioOutput, and \e own + * a ServerSocket. The state & behavior required by both is pushed down into + * class SoundInfoCache on which both depend. This arrangement breaks things up + * in a few ways. + * + * Cycle 1 is broken up by having a one-way relationship only between the socket + * server & clients. When a client detects that its socket has been closed, it + * marks itself "dead" and will eventually be reaped by the server. + * + * Cycle 2 is broken by Lakos' method of "demotion": the functionality required + * by both the output plugin & the various clients is pushed down into a + * separate class SoundInfoCache. It is owned by the plugin, and referenced by + * clients. When the plugin is disabled, the plugin is responsible for + * cleaning-up the server, which will in turn clean-up all the clients, and only + * then destroying the SoundInfoCache instance. + * + * In ASCII art: + * + \verbatim + sound +---------------------+ +---------------------+ + -- data ----> | VisualizationOutput | --- owns ---> | VisualizationServer | + +---------------------+ +---------------------+ + | Play() | | OnAccept() | + +---------------------+ +---------------------+ + 1 | | 1 + | +---owns----+ + | | + | v * + | +---------------------+ + owns | VisualizationClient | + | +---------------------+ + | | * + | +----references------+ + | | + 1 v v 1 + +----------------+ + | SoundInfoCache | + +----------------+ + \endverbatim + * + * This arrangement also addresses the threading issue: other than creation & + * destruction, the socket server has all of its methods invoked on the I/O + * thread, and those of the plugin on the player control thread. The state that + * needs to be guarded against access from multiple threads is localized in + * SoundInfoCache. + * + * + * \section vis_out_arch_promotion A Discarded Approach + * + * The \ref vis_out_back "idea" of having sound analysis accessible through the + * MPD client + * protocol + * to me begged the question: why not have SoundInfoCache be owned directly by + * MultipleOutputs? MPD clients could make requests directly via + * + \code + partition.outputs.sound_info_cache.analyze(...); + \endcode + * + * We could hand a reference to it to the visualization output plugin, and have + * the plugin be solely responsible for serving the network protocol. 
+ * + * I saw a few advantages to this: + * + * 1. Convenient access for the implementations of MPD client protocol commands + * + * 2. Users could get sound analysis via the MPD client protocol without having + * to configure & enable an output plugin + * + * 3. General simplification-- the output plugin would only be responsible + * for serving the network protocol + * + * All that said, I discarded this approach. If I wanted the sound analysis to + * receive sound data post-cross-fade, post-replay gain and after any other + * filtering, it was going to need to own an AudioOutputSource instance. Thing + * is, when I open an AudioOutputSource I need: + * + * - the AudioFormat + * - a reference to the MusicPipe + * - the ReplayGain filter(s) + * - any other filters + * + * MultipleOutputs doesn't know these; it's just got a bunch of + * configuration. The configuration gets turned into these objects in + * FilteredAudioOutput::Setup() and it's non-trivial to do so. The plumbing is + * complex enough that I'm inclined to leave it where it is. So now we're at a + * point where SoundInfoCache would need to own both an AudioOutputSource \e and + * a FilteredAudioOutput... at which point it starts to look very much like an + * AudioOutputControl (in other words, just another audio output under + * MultipleOutputs). + * + * + */ + +/** + * \class VisualizationOutput + * + * \brief An output plugin that serves data useful for music visualizers + * + * \sa \ref vis_out_plugin_arch "Architecture" + * + * + * Both the fifo & pipe output plugins can be used to directly access the PCM + * audio data, and so can (and have been) used to implement music visualizers + * for MPD. They are, however, limited to clients running on the same host as + * MPD. This output plugin will stream PCM samples along with derived + * information useful for visualizers (the Fourier transform, bass/mids/trebs, + * and so forth) over one or more network connections, to allow true MPD client + * visualizers. + * + * + */ + +class VisualizationOutput: public AudioOutput { + + /* When the plugin is enabled, we actually "open" the server (which is + * to say, bind the socket & begin accepting incoming connections) */ + VisualizationServer server; + /* This will be null unless the plugin is open; it's a `shared_ptr` + * because we share references with the socket servers and the + * `VisualizationClient` instances representing active connections */ + std::shared_ptr pcache; + /// The number of seconds' worth of audio data to be cached + std::chrono::seconds cache_duration; + +public: + static AudioOutput* Create(EventLoop &event_loop, + const ConfigBlock &cfg_block) { + return new VisualizationOutput(event_loop, cfg_block); + } + VisualizationOutput(EventLoop &event_loop, + const ConfigBlock &cfg_block); + + virtual ~VisualizationOutput(); // We have virtuals, so... + +public: + + //////////////////////////////////////////////////////////////////////// + // AudioOutput Interface // + //////////////////////////////////////////////////////////////////////// + + /** + * Enable the device. This may allocate resources, preparing + * for the device to be opened. + * + * Throws on error. + */ + virtual void Enable() override; + + /** + * Disables the device. It is closed before this method is called. + */ + virtual void Disable() noexcept override; + + /** + * Really open the device-- mandatory. + * + * Throws on error. 
+ * + * @param audio_format the audio format in which data is going + * to be delivered; may be modified by the plugin + */ + virtual void Open(AudioFormat &audio_format) override; + + /** + * Close the device-- mandatory. + */ + virtual void Close() noexcept override; + + /** + * Play a chunk of audio data-- mandatory. The method blocks until at + * least one audio frame is consumed. + * + * Throws on error. + * + * May throw #AudioOutputInterrupted after Interrupt() has + * been called. + * + * @return the number of bytes played (must be a multiple of + * the frame size) + */ + virtual size_t Play(std::span src) override; + +}; + +} // namespace Visualization + +using std::make_unique; + +const Domain vis_output_domain("vis_output"); + +Visualization::VisualizationOutput::VisualizationOutput( + EventLoop &event_loop, + const ConfigBlock &config_block): + AudioOutput(FLAG_ENABLE_DISABLE | FLAG_PAUSE), + server(event_loop, + config_block.GetBlockValue("bind_to_address"), + config_block.GetBlockValue("port", 8001U), + config_block.GetPositiveValue("max_clients", 0), + Visualization::SoundAnalysisParameters(config_block)), + cache_duration(config_block.GetPositiveValue("cache_duration", 1)) +{ } + +Visualization::VisualizationOutput::~VisualizationOutput() +{ } + +void +Visualization::VisualizationOutput::Enable() { + + FmtInfo(vis_output_domain, "VisualizationOutput::Enable({})", gettid()); + + BlockingCall(server.GetEventLoop(), [this](){ + server.Open(); + }); + +} + +void +Visualization::VisualizationOutput::Disable() noexcept { + + FmtInfo(vis_output_domain, "VisualizationOutput::Disable({})", gettid()); + + BlockingCall(server.GetEventLoop(), [this](){ + server.Close(); + }); + +} + +void +Visualization::VisualizationOutput::Open(AudioFormat &audio_format) +{ + FmtInfo(vis_output_domain, "VisualizationOutput::Open({})", gettid()); + + /* At this point, we know the audio format, so we can at this point + * instantiate the PCM data cache. */ + pcache = make_shared(audio_format, + cache_duration); + + BlockingCall(server.GetEventLoop(), [this, audio_format]() { + server.OnPluginOpened(pcache); + }); +} + +void +Visualization::VisualizationOutput::Close() noexcept +{ + FmtInfo(vis_output_domain, "VisualizationOutput::Close({})", gettid()); + + BlockingCall(server.GetEventLoop(), [this]() { + server.OnPluginClosed(); + }); + + pcache = nullptr; +} + +size_t +Visualization::VisualizationOutput::Play(const std::span src) +{ + pcache->Add(src.data(), src.size()); + return src.size(); +} + +const struct AudioOutputPlugin visualization_output_plugin = { + "visualization", + nullptr, // cannot serve as the default output + &Visualization::VisualizationOutput::Create, + nullptr, // no particular mixer +}; diff --git a/src/output/plugins/visualization/VisualizationOutputPlugin.hxx b/src/output/plugins/visualization/VisualizationOutputPlugin.hxx new file mode 100644 index 0000000000..a2a9001462 --- /dev/null +++ b/src/output/plugins/visualization/VisualizationOutputPlugin.hxx @@ -0,0 +1,147 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +// Copyright The Music Player Daemon Project + +#ifndef MPD_VISUALIZATION_OUTPUT_PLUGIN_HXX +#define MPD_VISUALIZATION_OUTPUT_PLUGIN_HXX + +/** + * \page vis_out The Visualization Output Plugin + * + * \section vis_out_intro Introduction + * + * Unlike most output plugins, which provide sound data in one format or + * another, this plugin provides data \e derived from the current audio stream + * convenient for authors of + * music visualizers. 
+ * + * \section vis_out_back Background + * + * This plugin started from a conversation on the #mpd IRC channel. I asked + * about the best way to implement a music visualizer as a remote MPD + * client. All of the MPD visualizers of which I was aware used the + * fifo + * output plugin and consequently had to be run on the same host as the MPD + * daemon. It was suggested that I write an output plugin that would stream the + * data needed to implement a visualizer. + * + * I submitted an + * RFC + * in which we kicked around the ideas of implementing the simplest protocol + * first, and of exposing sound information not only over a network protocol + * (like, say, the HttpdOutput plugin), but also over the MPD + * client protocol. + * + * This plugin is the result of those conversations. + * + * \subsection vis_out_prior Prior Art + * + * Music visualization sources which I consulted before settling on this approach: + * + * - This PR + * proposed solving the problem by implementing an output plugin that would + * stream the raw PCM data over TCP, the idea being that the remote visualizer + * would do the sound analysis client-side. The PR was discarded as being + * redundant with the \c HttpdOutput plugin. I would also observe that such a + * solution sends \e far more data on the wire than is needed for + * visualization. + * + * - ncmpcpp uses the + * FifoOutput plugin, and as such can only provide the visualization feature + * when it's being run locally. The sound analysis is limited, as well (on + * which more below). + * + * - cli-visualizer will + * work with the MPD FIFO output plugin (again assuming the MPD daemon is + * running locally). Limited sound analysis, as well. + * + * - MilkDrop: + * reading the source code was very instructive in terms of sound analysis for + * music visualization; that aspect of this plugin is largely based on it. + * + * + * \section vis_out_plugin The Plugin + * + * A new output plugin "visualization" is provided. The plugin "plays" PCM data + * by caching it. It provides continuous sound analysis at some caller-supplied + * offset of the current song time consisting of PCM samples, Fourier + * coefficients, frequency information & so forth. Like \c HttpdOutput and + * \c SnapcastOutput, the plugin includes a socket server that will provide a + * network endpoint at which clients can access sound analysis. In the future, + * analysis may be made available over the MPD client protocol as well. + * + * + * \subsection vis_output_plugin_arch Architecture + * + * VisualizationOutput is like HttpdOutput and SnapcastOutput in that it + * implements both an AudioOutput and a socket server. Unlike those two + * implementations, I chose not to multiply inherit from AudioOutput & + * ServerSocket. The are more details \ref vis_out_arch "here", but briefly: I + * chose to have VisualizationOutput \e own a ServerSocket rather than \e be a + * ServerSocket, and pushed the responsibility for caching PCM data down into + * class SoundInfoCache on which both my output plugin & socket server + * depend. This arrangement is intended to both break-up circular dependencies + * among the classes involved as well as reduce the number of places in which + * objects are accessed by multiple threads. + * + * + * \subsection vis_output_plugin_analysis Sound Analysis + * + * Given audio data in raw PCM format, a number of steps may be taken to analyze + * that data & produce information useful to visualizer authors. This section + * describes the full pipeline briefly. 
Most of these steps are optional at
+ * request-time and are described in greater detail in the relevant docs.
+ *
+ * - the PCM data may optionally be damped by taking a weighted average between
+ *   the current values & prior values in the time domain; this will have the
+ *   effect of reducing noise in the higher frequency ranges
+ *
+ * - the PCM data may have a window function applied to it in the time domain
+ *   around the time of interest; such a function has the effect of "dialing
+ *   down" audio samples further from the timestamp of interest and again will
+ *   reduce higher-frequency noise; the size of the window may be configured to
+ *   incorporate more or less data as desired.
+ *
+ * - the resulting PCM data will be shifted into the frequency domain by
+ *   application of the Discrete Fourier Transform
+ *
+ * - the human ear can only distinguish frequencies from (about) 200Hz to
+ *   20000Hz, and in practice musical sound information doesn't show much
+ *   activity above 10000Hz; it is therefore convenient to throw out frequency
+ *   data outside some (configurable) frequency range
+ *
+ * - it is also convenient to divide the resulting spectrum into a few coarse
+ *   bands, such as bass/mids/trebs. This is computationally non-trivial
+ *   because perceptually, frequency is not linear, it's logarithmic: a change
+ *   of one octave corresponds to a doubling in frequency. Intuitively, this
+ *   means that the difference between 200 & 300Hz is much greater than the
+ *   difference between 5000 & 5100Hz, e.g. The plugin will perform this
+ *   service for clients.
+ *
+ * - it can also be useful to maintain a weighted time average of the activity
+ *   in each frequency range for purposes of beat detection
+ *
+ *
+ * \subsection vis_output_protocol The Protocol
+ *
+ * The specifics of sound analysis are defined in the plugin configuration & are
+ * identical for all clients. When clients connect, they provide the frame rate
+ * at which they would like to receive updates and the offset between
+ * client-side render time & server-side song time (to account for network lag,
+ * client-side buffering & the time needed to render each frame). Once that
+ * initial handshake is complete, the server will stream updates containing
+ * sound analysis results at regular intervals to the client.
+ *
+ * Note that each update need only be based on relatively few samples (Winamp,
+ * e.g., used 576). This will keep the data transferred on the wire small (at
+ * least by comparison to, say, the httpd output plugin, which of course needs
+ * to send the entire song). Casting the protocol in terms of client-side FPS
+ * allows us to avoid a "request/response" protocol & simply stream until the
+ * client goes away.
+ *
+ * The protocol specification has its own \ref vis_out_protocol "page".
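+ *
+ * By way of illustration, a configuration block for this plugin might look
+ * like the following (a sketch only: "port", "max_clients" & "cache_duration"
+ * show the defaults supplied in code, "bind_to_address" is optional, and the
+ * sound analysis parameters, which are read from the same block, are
+ * omitted here):
+ *
+ \code
+ audio_output {
+	type		"visualization"
+	name		"my visualizer"
+	bind_to_address	"any"
+	port		"8001"
+	max_clients	"0"	# zero means "no limit"
+	cache_duration	"1"	# seconds' worth of PCM data to cache
+ }
+ \endcode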
+ * + * + */ + +extern const struct AudioOutputPlugin visualization_output_plugin; + +#endif diff --git a/src/output/plugins/visualization/VisualizationServer.cxx b/src/output/plugins/visualization/VisualizationServer.cxx new file mode 100644 index 0000000000..8dd0287b25 --- /dev/null +++ b/src/output/plugins/visualization/VisualizationServer.cxx @@ -0,0 +1,105 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +// Copyright The Music Player Daemon Project + +#include "VisualizationServer.hxx" + +#include "Log.hxx" +#include "config/Block.hxx" +#include "util/Domain.hxx" + +using std::make_unique, std::move; + +const Domain vis_server_domain("vis_server"); + +Visualization::VisualizationServer::VisualizationServer( + EventLoop &event_loop, + const char *bind_to_address, + uint16_t port, + size_t max_clients_in, + const SoundAnalysisParameters ¶ms_in) +: ServerSocket(event_loop), + max_clients(max_clients_in), + reaper(event_loop, BIND_THIS_METHOD(ReapClients)), + sound_params(params_in) +{ + FmtInfo(vis_server_domain, "VisualizationServer::VisualizationServer(" + "{}:{}, {} clients maximum)", bind_to_address, port, + max_clients); + + ServerSocketAddGeneric(*this, bind_to_address, port); +} + +void +Visualization::VisualizationServer::ReapClients() noexcept +{ + FmtNotice(vis_server_domain, "VisualizationServer::ReapClients({}, " + "{} clients)", gettid(), clients.size()); + + for (auto p0 = clients.begin(), p1 = clients.end(); p0 != p1; ) { + auto p = p0++; + if (p->IsClosed()) { + LogInfo(vis_server_domain, "Reaping closed client."); + clients.erase(p); + } + } + + if (!clients.empty()) { + LogInfo(vis_server_domain, "Scheduling another reaping in 3 " + "seconds."); + reaper.Schedule(std::chrono::seconds(3)); + } +} + +void +Visualization::VisualizationServer::OnPluginOpened( + const std::shared_ptr &pcache) +{ + state = HavePcmData{pcache }; + + for (auto p0 = clients.begin(), p1 = clients.end(); p0 != p1; ) { + auto p = p0++; + if (! p->IsClosed()) { + p->OnPluginOpened(pcache); + } + } +} + +void +Visualization::VisualizationServer::OnPluginClosed() +{ + state = std::monostate {}; + + for (auto p0 = clients.begin(), p1 = clients.end(); p0 != p1; ) { + auto p = p0++; + if (! p->IsClosed()) { + p->OnPluginClosed(); + } + } + +} + +void +Visualization::VisualizationServer::OnAccept(UniqueSocketDescriptor fd, + SocketAddress /*address*/, + int) noexcept +{ + FmtInfo(vis_server_domain, "VisualizationServer::OnAccept({})", + gettid()); + + // Can we allow an additional client? 
+ if (max_clients && clients.size() >= max_clients) { + FmtError(vis_server_domain, "Rejecting connection request; " + "the maximum number of clients ({}) has already been " + "reached.", max_clients); + } else { + if (state.index()) { + auto have_pcm_data = get(state); + clients.emplace_back(std::move(fd), GetEventLoop(), sound_params, + have_pcm_data.pcache); + } else { + clients.emplace_back(std::move(fd), GetEventLoop(), + sound_params); + } + reaper.Schedule(std::chrono::seconds(3)); + } +} diff --git a/src/output/plugins/visualization/VisualizationServer.hxx b/src/output/plugins/visualization/VisualizationServer.hxx new file mode 100644 index 0000000000..d8ebf79146 --- /dev/null +++ b/src/output/plugins/visualization/VisualizationServer.hxx @@ -0,0 +1,119 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +// Copyright The Music Player Daemon Project + +#ifndef VISUALIZATION_SERVER_HXX_INCLUDED +#define VISUALIZATION_SERVER_HXX_INCLUDED 1 + +#include "VisualizationClient.hxx" + +#include "SoundAnalysis.hxx" + +#include "config/Net.hxx" +#include "event/CoarseTimerEvent.hxx" +#include "event/ServerSocket.hxx" +#include "net/SocketAddress.hxx" +#include "net/UniqueSocketDescriptor.hxx" + +struct AudioFormat; +struct ConfigBlock; + +namespace Visualization { + +class SoundInfoCache; + +/** + * \class VisualizationServer + * + * \brief A socker server handling visualization clients + * + * \sa \ref vis_out_arch "Architecture" + * + * + * This class handles binding one or more sockets & accepting incoming + * connections. For each such incoming connection, it will allocate a + * VisualizationClient instance to represent that client. + * + * The clients require both a PCM data cache and a SoundAnalysis instance to do + * their work. The former must be shared with the plugin that ultimately owns + * this class as well as the VisualizationClient instances, while the latter is + * cheaply copyable and so each client simply gets its own copy. + * + * The problem is that both must know the audio format in use (i.e. the number + * of samples per second and the number of channels), and that is only known + * when the plugin is "opened". Therefore this class can be represented by, yes, + * a finite state machine: + * + \code + + Open --- OnPluginOpened() ---> HavePcmData + ^ | + | | + +---- OnPluginClosed() ----------+ + + \endcode + * + * When a new client connection is opened: + * + * - if we are in state Open, we cannot provide the client with sound analysis + * information nor a reference to the PCM cache + * - if we are in state HavePcmData, we can share a reference to our PCM cache + * along with the salient information needed for sound analysis + * + * On state change: + * + * - from Open to HavePcmData, we can update all extant clients with a + * shared reference to the PCM cache as well as the new sound analysis + * information + * - from HavePcmData to Open, we need to tell all extant clients to + * drop their PCM cache references, as well as their sound analysis + * information + * + * + */ + +class VisualizationServer : public ServerSocket { + + /// only valid when the plugin is open + struct HavePcmData { + // I wish C++ had a `not_null` class + std::shared_ptr pcache; + }; + /// Present state-- v means closed, v means opened (the plugin, that is) + std::variant state; + /// maximum number of clients permitted; zero => unlimited + size_t max_clients; + + /* Clients have both a reference to the PCM cache as well as a + * SoundAnalysis instance while the plugin is opened. 
We'll create new + * clients with our present state. + * Nb. that VisualizationClient, being a BufferedSocket, is not + * copy constructable, and so must be emplaced. */ + std::list clients; + /// invoked periodically to clean-up dead clients + CoarseTimerEvent reaper; + // Audio analysis parameters + SoundAnalysisParameters sound_params; + +public: + VisualizationServer(EventLoop &event_loop, const char *bind_to_address, + uint16_t port, size_t max_clients, + const SoundAnalysisParameters ¶ms); + + void ReapClients() noexcept; + void OnPluginOpened(const std::shared_ptr &pcache); + void OnPluginClosed(); + +protected: + /* Invoked by `ServerSocket`, on its event loop, when a new client connects + * + * \a fd is the file descriptor of our new socket, \a address is the + * remote address, and \a uid is the effective UID of the client if \a + * fd is a UNIX-domain socket */ + virtual void OnAccept(UniqueSocketDescriptor fd, SocketAddress address, + int uid) noexcept override; + +}; + +} // namespace Visualization + +#endif // VISUALIZATION_SERVER_HXX_INCLUDED diff --git a/src/pcm/AudioFormat.hxx b/src/pcm/AudioFormat.hxx index a7781aa351..108182466b 100644 --- a/src/pcm/AudioFormat.hxx +++ b/src/pcm/AudioFormat.hxx @@ -129,6 +129,10 @@ struct AudioFormat { */ unsigned GetFrameSize() const noexcept; + uint32_t GetSampleRate() const noexcept { + return sample_rate; + } + template constexpr auto TimeToFrames(D t) const noexcept { using Period = typename D::period; diff --git a/test/TestVisualization.cxx b/test/TestVisualization.cxx new file mode 100644 index 0000000000..20c1c75703 --- /dev/null +++ b/test/TestVisualization.cxx @@ -0,0 +1,953 @@ +/* + * Copyright 2003-2022 The Music Player Daemon Project + * http://www.musicpd.org + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ */ + +#include "output/plugins/visualization/SoundAnalysis.hxx" +#include "output/plugins/visualization/SoundInfoCache.hxx" +#include "output/plugins/visualization/Protocol.hxx" +#include "util/ByteOrder.hxx" + +#include +#include +#include + +#include +#include +#include +#include + +using namespace Visualization; + +// "Smoke test" for SoundInfoCache +TEST(VisualizationTest, SoundInfoCacheSmoke) +{ + using namespace std; + using namespace std::chrono; + + // Validate a few assumptions I'm making about the API + AudioFormat std_fmt(44100, SampleFormat::S16, 2); + EXPECT_EQ(std_fmt.TimeToSize(seconds(1)), 44100 * 2 * 2); + EXPECT_TRUE(std_fmt.IsFullyDefined()); + EXPECT_TRUE(std_fmt.IsValid()); + EXPECT_EQ(std_fmt.GetFrameSize(), 4); + EXPECT_EQ(std_fmt.GetSampleRate(), 44100); + + // Whip-up an unrealistic, but easy-to-reason-about audio format for testing + // purposes: 1Hz, mono, samples are signed bytes + AudioFormat fmt(1, SampleFormat::S8, 1); + EXPECT_TRUE(fmt.IsFullyDefined()); + EXPECT_TRUE(fmt.IsValid()); + + { + // Silly case-- a cache that can handle exactly three samples + Visualization::SoundInfoCache cache(fmt, seconds(3)); + // Add 2 seconds' worth of data + int8_t data[] = { 1, 2 }; + cache.Add(data, sizeof(data)); + + // I now expect to have the following in my three-slot ring buffer: + // + // +---+---+---+ + // | 1 | 2 | | + // +---+---+---+ + // ^ ^ + // p0 p1 + + EXPECT_EQ(cache.Size(), 2); + + int8_t buf[3]; + bool status = cache.GetFromBeginning(2, buf, sizeof(buf)); + EXPECT_TRUE(status); + EXPECT_EQ(buf[0], 1); + EXPECT_EQ(buf[1], 2); + + data[0] = 3; data[1] = 4; + cache.Add(data, sizeof(data)); + + // I now expect to have the following in my three-slot ring buffer: + // + // +---+---+---+ + // | 4 | 2 | 3 | + // +---+---+---+ + // ^ + // p0,p1 + + EXPECT_EQ(cache.Size(), 3); + + status = cache.GetFromBeginning(3, buf, sizeof(buf)); + EXPECT_TRUE(status); + EXPECT_EQ(buf[0], 2); + EXPECT_EQ(buf[1], 3); + EXPECT_EQ(buf[2], 4); + + data[0] = 5; + cache.Add(data, 1); + + // I now expect to have the following in my three-slot ring buffer: + // + // +---+---+---+ + // | 4 | 5 | 3 | + // +---+---+---+ + // ^ + // p0,p1 + + EXPECT_EQ(cache.Size(), 3); + + status = cache.GetFromBeginning(3, buf, sizeof(buf)); + EXPECT_TRUE(status); + EXPECT_EQ(buf[0], 3); + EXPECT_EQ(buf[1], 4); + EXPECT_EQ(buf[2], 5); + + int8_t data3[] = { 6, 7, 8 }; + cache.Add(data3, 3); + + // I now expect to have the following in my three-slot ring buffer: + // + // +---+---+---+ + // | 7 | 8 | 6 | + // +---+---+---+ + // ^ + // p0,p1 + + EXPECT_EQ(cache.Size(), 3); + + status = cache.GetFromBeginning(3, buf, sizeof(buf)); + EXPECT_TRUE(status); + EXPECT_EQ(buf[0], 6); + EXPECT_EQ(buf[1], 7); + EXPECT_EQ(buf[2], 8); + + int8_t data4[] = { 9, 10, 11, 12 }; + cache.Add(data4, 4); + + // I now expect to have the following in my three-slot ring buffer: + // + // +----+----+----+ + // | 10 | 11 | 12 | + // +----+----+----+ + // ^ + // p0,p1 + + EXPECT_EQ(cache.Size(), 3); + + status = cache.GetFromBeginning(3, buf, sizeof(buf)); + EXPECT_TRUE(status); + EXPECT_EQ(buf[0], 10); + EXPECT_EQ(buf[1], 11); + EXPECT_EQ(buf[2], 12); + } +} + +// Test SoundInfoCache WRT timing +TEST(VisualizationTest, SoundInfoCacheTiming) +{ + using namespace std; + using namespace std::chrono; + + // Whip-up an unrealistic, but easy-to-reason-about audio format for testing purposes: + // 1Hz, mono, samples are signed bytes (i.e. 
1 byte per sample + AudioFormat fmt(1, SampleFormat::S8, 1); + EXPECT_TRUE(fmt.IsFullyDefined()); + + // Silly case-- a cache that can handle exactly three samples + Visualization::SoundInfoCache cache(fmt, seconds(3)); + // Add 2 seconds' worth of data + int8_t data[] = { 1, 2 }; + cache.Add(data, sizeof(data)); + + // I now expect to have the following in my three-slot ring buffer: + // + // +---+---+---+ + // | 1 | 2 | | + // +---+---+---+ + // ^ ^ + // p0 p1 + // t0 t1 = t0 + 2 seconds + // + // I don't know what t0 is (it will be different every time this test is + // run), but t1 should be two seconds later than t0. + Visualization::SoundInfoCache::Time t0, t1; + tie(t0, t1) = cache.Range(); + EXPECT_EQ(t1 - t0, seconds(2)); + + int8_t buf[3]; + bool status = cache.GetByTime(2, t1, buf, sizeof(buf)); + EXPECT_TRUE(status); + + EXPECT_EQ(buf[0], 1); + EXPECT_EQ(buf[1], 2); + + // Add 1 second's worth of data + data[0] = 3; + cache.Add(data, 1); + + // I now expect to have the following in my three-slot ring buffer: + // + // +---+---+---+ + // | 1 | 2 | 3 | + // +---+---+---+ + // ^ + // p0, p1 + // t0 + // t1 = t0 + 3 seconds + // + // I don't know what t0 is (it will be different every time this test is + // run), but t1 should be three seconds later than t0. + tie(t0, t1) = cache.Range(); + EXPECT_EQ(t1 - t0, seconds(3)); + + status = cache.GetByTime(3, t1, buf, sizeof(buf)); + EXPECT_TRUE(status); + + EXPECT_EQ(buf[0], 1); + EXPECT_EQ(buf[1], 2); + EXPECT_EQ(buf[2], 3); + + // Add 1 second's worth of data + data[0] = 4; + cache.Add(data, 1); + + // I now expect to have the following in my three-slot ring buffer: + // + // +---+---+---+ + // | 4 | 2 | 3 | + // +---+---+---+ + // ^ + // p0, p1 + // t0 + // t1 = t0 + 3 seconds + // + // I don't know what t0 is (it will be different every time this test is + // run), but t1 should be three seconds later than t0. + tie(t0, t1) = cache.Range(); + EXPECT_EQ(t1 - t0, seconds(3)); + + status = cache.GetByTime(3, t1, buf, sizeof(buf)); + EXPECT_TRUE(status); + + EXPECT_EQ(buf[0], 2); + EXPECT_EQ(buf[1], 3); + EXPECT_EQ(buf[2], 4); + + // Add another second's worth of data + data[0] = 5; + cache.Add(data, 1); + + // I now expect to have the following in my three-slot ring buffer: + // + // +---+---+---+ + // | 4 | 5 | 3 | + // +---+---+---+ + // ^ + // p0, p1 + // t0 + // t1 = t0 + 3 seconds + // + // I don't know what t0 is (it will be different every time this test is + // run), but t1 should be three seconds later than t0. + tie(t0, t1) = cache.Range(); + EXPECT_EQ(t1 - t0, seconds(3)); + + // Add 2 seconds' worth of data + data[0] = 6; data[1] = 7; + cache.Add(data, 2); + + // I now expect to have the following in my three-slot ring buffer: + // + // +---+---+---+ + // | 7 | 5 | 6 | + // +---+---+---+ + // ^ + // p0, p1 + // t0 + // t1 = t0 + 3 seconds + + tie(t0, t1) = cache.Range(); + EXPECT_EQ(t1 - t0, seconds(3)); // 3 secs in the buffer + + // Ask for two samples, ending at `t1` + status = cache.GetByTime(2, t1, buf, sizeof(buf)); + EXPECT_TRUE(status); + + EXPECT_EQ(buf[0], 6); + EXPECT_EQ(buf[1], 7); + + // Let's try fractions-- at this point, we've got 3 second's worth of + // data in the cache, from [t0, t1 = t0 + 3 seconds). + + // What happens if we ask for two samples, ending at t0 + 2500ms? 
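+	// Under the presumed `GetByTime` contract-- n samples *ending* at the
+	// given instant-- t0 + 2500ms falls strictly inside the final sample's
+	// interval, so the request is served from the tail of the buffer;
+	// asking for three samples ending there should yield 5, 6, 7: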
+	status = cache.GetByTime(3, t0 + milliseconds(2500), buf, sizeof(buf));
+	EXPECT_TRUE(status);
+	EXPECT_EQ(buf[0], 5);
+	EXPECT_EQ(buf[1], 6);
+	EXPECT_EQ(buf[2], 7);
+
+	status = cache.GetByTime(2, t0 + milliseconds(1500), buf, sizeof(buf));
+	EXPECT_TRUE(status);
+	EXPECT_EQ(buf[0], 5);
+	EXPECT_EQ(buf[1], 6);
+
+	status = cache.GetByTime(1, t0 + milliseconds(500), buf, sizeof(buf));
+	EXPECT_TRUE(status);
+	EXPECT_EQ(buf[0], 5);
+
+	// Negative tests-- what happens if I ask for _two_ samples at t0 + 500ms--
+	// we can't satisfy that request
+	status = cache.GetByTime(2, t0 + milliseconds(500), buf, sizeof(buf));
+	EXPECT_FALSE(status);
+
+	// What if I ask for even one sample at t1 + 1ms?
+	status = cache.GetByTime(1, t1 + milliseconds(1), buf, sizeof(buf));
+	EXPECT_FALSE(status);
+}
+
+// Exercise SoundInfoCache on a more realistic waveform
+TEST(VisualizationTest, Waveform)
+{
+	using namespace std;
+	using namespace std::chrono;
+
+	const double TWO_PI = 6.283185307179586476925286766559;
+
+	// Let's generate a waveform for a 1Hz sine wave, sampled at 44100 samples
+	// per second. Using format 44100:16:2, that's just over 172 KiB (i.e. not
+	// too bad).
+	AudioFormat fmt(44100, SampleFormat::S16, 2);
+	EXPECT_TRUE(fmt.IsFullyDefined());
+
+	int16_t buf[44100 * 2];
+	for (int i = 0; i < 44100; ++i) {
+		double t = (double)i / 44100.0;
+		int16_t v = (int16_t) (sin(TWO_PI * t) * 32767.0);
+		buf[i * 2] = buf[i * 2 + 1] = v;
+	}
+
+	// Create a `SoundInfoCache` instance that can hold 5 seconds' worth of
+	// such data...
+	Visualization::SoundInfoCache cache(fmt, seconds(5));
+	// and add 6 seconds' worth of data to it.
+	cache.Add(buf, sizeof(buf));
+	Visualization::SoundInfoCache::Time t0, t1;
+	tie(t0, t1) = cache.Range();
+	EXPECT_EQ(t1 - t0, seconds(1));
+	cache.Add(buf, sizeof(buf));
+	cache.Add(buf, sizeof(buf));
+	cache.Add(buf, sizeof(buf));
+	cache.Add(buf, sizeof(buf));
+	cache.Add(buf, sizeof(buf));
+
+	// I should now have the *last* five seconds' worth of data in the cache.
+	Visualization::SoundInfoCache::Time t2, t3;
+	tie(t2, t3) = cache.Range();
+	EXPECT_EQ(t3 - t0, seconds(6));
+
+	// But we're at "song time" = 6 seconds
+	bool status = cache.GetByTime(100, t0 + seconds(6), buf, sizeof(buf));
+	EXPECT_TRUE(status);
+
+	// `buf[0:100]` should now contain the *last* 100 samples
+	for (int i = 0; i < 100; ++i) {
+		EXPECT_EQ(buf[2*i], buf[88000 + 2*i]);
+	}
+}
+
+/**
+ * \page vis_out_trivial_sample Trivial Waveforms for Testing Purposes
+ *
+ * \section vis_out_trivial_sample_intro Introduction
+ *
+ * Derivation of a trivial DFT for testing purposes.
+ *
+ * \section vis_out_trivial_sample_derivation Derivation
+ *
+ * Consider the waveform:
+ *
+ \code
+                   1
+   f(x) = sin(x) + - cos(2x)
+                   2
+ \endcode
+ *
+ * This function has a (continuous) Fourier transform of:
+ *
+ \code
+   1                                               1
+   - pi d(w - 2) - i pi d(w - 1) + i pi d(w + 1) + - pi d(w + 2)
+   2                                               2
+ \endcode
+ *
+ * where \c d denotes the Dirac delta function and \c w represents angular
+ * frequency. This makes sense: the frequency domain has "spikes" at frequencies
+ * of 1 & 2 (corresponding to the sin & cos arguments, respectively), and the
+ * "burst" at a frequency of 1 is twice as strong as that at 2 (corresponding to
+ * the sin & cos coefficients, resp.).
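+ *
+ * (A quick way to check this: write the waveform in complex exponentials.
+ * Under the convention used throughout-- F[exp(iax)] = 2 pi d(w - a)-- the
+ * transform can then be read off term by term:
+ *
+ \code
+            exp(ix) - exp(-ix)             exp(2ix) + exp(-2ix)
+   sin(x) = ------------------ , cos(2x) = --------------------
+                    2i                              2
+ \endcode
+ *
+ * so sin(x) contributes -i pi d(w - 1) + i pi d(w + 1), and (1/2)cos(2x)
+ * contributes (1/2) pi d(w - 2) + (1/2) pi d(w + 2), as above.)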
+ *
+ * Let's add a second waveform (so we can simulate stereo):
+ *
+ \code
+                    1
+   g(x) = sin(2x) + - cos(4x)
+                    4
+ \endcode
+ *
+ * The Fourier transform of \c g is:
+ *
+ \code
+   1                                         1
+   - pi d(w-4) - i pi d(w-2) + i pi d(w+2) + - pi d(w+4)
+   4                                         4
+ \endcode
+ *
+ * Similarly: we see spikes at 2 & 4, with the spike at 2 four times the size of
+ * the spike at 4.
+ *
+ * \subsection vis_out_trivial_sample_derivation_octave GNU Octave Code
+ *
+ \code
+
+   octave:1> pkg load symbolic
+   octave:2> syms x
+   octave:3> f = sin (x) + 1/2 * cos (2*x)
+   octave:4> fourier (f)
+   ans = (sym)
+
+     π⋅δ(w - 2)                                 π⋅δ(w + 2)
+     ────────── - ⅈ⋅π⋅δ(w - 1) + ⅈ⋅π⋅δ(w + 1) + ──────────
+         2                                          2
+
+   octave:5> g = sin (2*x) + 1/4 * cos (4*x)
+   octave:6> fourier (g)
+   ans = (sym)
+
+     π⋅δ(w - 4)                                 π⋅δ(w + 4)
+     ────────── - ⅈ⋅π⋅δ(w - 2) + ⅈ⋅π⋅δ(w + 2) + ──────────
+         4                                          4
+
+ \endcode
+ *
+ * \subsection vis_out_trivial_sample_derivation_wolfram Wolfram Language
+ *
+ \code
+
+   FourierTransform[Sin[x]+1/2 Cos[2x], x, \[Omega], FourierParameters -> {1,-1}]
+   = 1/2 \[Pi] DiracDelta[-2+\[Omega]] - I \[Pi] DiracDelta[-1+\[Omega]] +
+     I \[Pi] DiracDelta[1+\[Omega]] + 1/2 \[Pi] DiracDelta[2+\[Omega]]
+
+   FourierTransform[Sin[2x]+1/4 Cos[4x], x, \[Omega], FourierParameters -> {1,-1}]
+   = 1/4 \[Pi] DiracDelta[-4 + \[Omega]] - I \[Pi] DiracDelta[-2 + \[Omega]] +
+     I \[Pi] DiracDelta[2 + \[Omega]] + 1/4 \[Pi] DiracDelta[4 + \[Omega]]
+
+ \endcode
+ *
+ * \subsection vis_out_trivial_sample_dfts Discrete Fourier Transforms
+ *
+ * Let's sample these waveforms at 5 points over the range 0 to 2Pi: that's far
+ * too low a sampling rate to see much of anything, but it \em is simple enough
+ * that we can compute the discrete Fourier transform by hand for testing
+ * purposes (we'll use a more realistic sampling rate later; right now we just
+ * want to check our basic calculations).
+ *
+ * At the same time, for convenience, let's introduce a transformation so that
+ * we can tell the codebase that we're sampling once per second (since 2*pi/5 is
+ * around 1.2566 and AudioFormat only accepts integers for the sample rate).
+ * Let x = pi * u / 2, and we'll work in terms of u:
+ *
+ \code
+
+   i   u   x = u * pi/2   f(x)   g(x)
+   --  --  ------------   ----   ----
+   0   0       0 sec       1/2    1/4
+   1   1       Pi/2        1/2    1/4
+   2   2       Pi          1/2    1/4
+   3   3       3*Pi/2     -3/2    1/4
+   4   4       2*Pi        1/2    1/4
+
+ \endcode
+ *
+ * \subsubsection vis_out_trivial_sample_f
+ *
+ * Let's work out the Fourier coefficients "by hand".
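+ * As with the continuous transforms above, we use Mathematica's
+ * FourierParameters -> {1,-1} convention, i.e. no normalization on the
+ * forward transform:
+ *
+ \code
+
+          N-1
+          ---
+          \              -2*pi*i*j*k/N
+   Y   =  /    x    *   e                    (here, N = 5)
+    k     ---   j
+          j=0
+
+ \endcode
+ *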
Let the k-th discrete + * Fourier coefficient for f be Y(k) and let the summing index for each + * coefficient be k: + * + \code + + k j => 0 1 2 3 4 + | + v 1 -2pi*0*0*i/5 1 -2pi*1*0*i/5 1 -2pi*2*0*i/5 3 -2pi*3*0*i/5 1 -2pi*4*0*i/5 + Y = - e + - e + - e - - e + - e + 0 0 2 2 2 2 2 + + 1 -2pi*0*1*i/5 1 -2pi*1*1*i/5 1 -2pi*2*1*i/5 3 -2pi*3*1*i/5 1 -2pi*4*1*i/5 + 1 Y = - e + - e + - e - - e + - e + 1 2 2 2 2 2 + + 1 -2pi*0*2*i/5 1 -2pi*1*2*i/5 1 -2pi*2*2*i/5 3 -2pi*3*2*i/5 1 -2pi*4*2*i/5 + 2 Y = - e + - e + - e - - e + - e + 2 2 2 2 2 2 + + 1 -2pi*0*3*i/5 1 -2pi*1*3*i/5 1 -2pi*2*3*i/5 3 -2pi*3*3*i/5 1 -2pi*4*3*i/5 + 3 Y = - e + - e + - e - - e + - e + 3 2 2 2 2 2 + + 1 -2pi*0*4*i/5 1 -2pi*1*4*i/5 1 -2pi*2*4*i/5 3 -2pi*3*4*i/5 1 -2pi*4*4*i/5 + 4 Y = - e + - e + - e - - e + - e + 4 2 2 2 2 2 + + \endcode + * + * OK-- time to let Octave take over: + * + \code + + vpa(1/sym(2)*exp(-sym(2)*sym(pi)*0* 0 *i/sym(5)) + 1/sym(2)*exp(-sym(2)*sym(pi)*1* 0 *i/sym(5)) + 1/sym(2)*exp(-sym(2)*sym(pi)*sym(2)* 0 *i/sym(5)) - sym(3)/sym(2)*exp(-sym(2)*sym(pi)*sym(3)* 0* i/sym(5)) + 1/sym(2)*exp(-sym(2)*sym(pi)*sym(4)* 0 *i/sym(5))) + vpa(1/sym(2)*exp(-sym(2)*sym(pi)*0* 1 *i/sym(5)) + 1/sym(2)*exp(-sym(2)*sym(pi)*1* 1 *i/sym(5)) + 1/sym(2)*exp(-sym(2)*sym(pi)*sym(2)* 1 *i/sym(5)) - sym(3)/sym(2)*exp(-sym(2)*sym(pi)*sym(3)* 1* i/sym(5)) + 1/sym(2)*exp(-sym(2)*sym(pi)*sym(4)* 1 *i/sym(5))) + vpa(1/sym(2)*exp(-sym(2)*sym(pi)*0*sym(2)*i/sym(5)) + 1/sym(2)*exp(-sym(2)*sym(pi)*1*sym(2)*i/sym(5)) + 1/sym(2)*exp(-sym(2)*sym(pi)*sym(2)*sym(2)*i/sym(5)) - sym(3)/sym(2)*exp(-sym(2)*sym(pi)*sym(3)*sym(2)*i/sym(5)) + 1/sym(2)*exp(-sym(2)*sym(pi)*sym(4)*sym(2)*i/sym(5))) + vpa(1/sym(2)*exp(-sym(2)*sym(pi)*0*sym(3)*i/sym(5)) + 1/sym(2)*exp(-sym(2)*sym(pi)*1*sym(3)*i/sym(5)) + 1/sym(2)*exp(-sym(2)*sym(pi)*sym(2)*sym(3)*i/sym(5)) - sym(3)/sym(2)*exp(-sym(2)*sym(pi)*sym(3)*sym(3)*i/sym(5)) + 1/sym(2)*exp(-sym(2)*sym(pi)*sym(4)*sym(3)*i/sym(5))) + vpa(1/sym(2)*exp(-sym(2)*sym(pi)*0*sym(4)*i/sym(5)) + 1/sym(2)*exp(-sym(2)*sym(pi)*1*sym(4)*i/sym(5)) + 1/sym(2)*exp(-sym(2)*sym(pi)*sym(2)*sym(4)*i/sym(5)) - sym(3)/sym(2)*exp(-sym(2)*sym(pi)*sym(3)*sym(4)*i/sym(5)) + 1/sym(2)*exp(-sym(2)*sym(pi)*sym(4)*sym(4)*i/sym(5))) + + ans = (sym) 0.5000000000000000000000000000000 + ans = (sym) 1.6180339887498948482045868343656 - 1.1755705045849462583374119092781⋅ⅈ + ans = (sym) -0.61803398874989484820458683436564 + 1.9021130325903071442328786667588⋅ⅈ + ans = (sym) -0.61803398874989484820458683436564 - 1.9021130325903071442328786667588⋅ⅈ + ans = (sym) 1.6180339887498948482045868343656 + 1.1755705045849462583374119092781⋅ⅈ + + \endcode + * + * Let's confirm with Mathematica: + * + \code + + In[5]:= Fourier[{1/2,1/2,1/2,-3/2,1/2}, FourierParameters -> {1,-1}] + Out[5]= {0.5 +0. 
I,1.61803 -1.17557 I,-0.618034+1.90211 I,-0.618034-1.90211 I,1.61803 +1.17557 I} + + \endcode + * + * \subsubsection vis_out_trivial_sample_g + * + \code + + k j => 0 1 2 3 4 + | + v 1 -2pi*0*0*i/5 1 -2pi*1*0*i/5 1 -2pi*2*0*i/5 1 -2pi*3*0*i/5 1 -2pi*4*0*i/5 + Y = - e + - e + - e + - e + - e + 0 0 4 4 4 4 4 + + 1 -2pi*0*1*i/5 1 -2pi*1*1*i/5 1 -2pi*2*1*i/5 1 -2pi*3*1*i/5 1 -2pi*4*1*i/5 + 1 Y = - e + - e + - e + - e + - e + 1 4 4 4 4 4 + + 1 -2pi*0*2*i/5 1 -2pi*1*2*i/5 1 -2pi*2*2*i/5 1 -2pi*3*2*i/5 1 -2pi*4*2*i/5 + 2 Y = - e + - e + - e + - e + - e + 2 4 4 4 4 4 + + 1 -2pi*0*3*i/5 1 -2pi*1*3*i/5 1 -2pi*2*3*i/5 1 -2pi*3*3*i/5 1 -2pi*4*3*i/5 + 3 Y = - e + - e + - e + - e + - e + 3 4 4 4 4 4 + + 1 -2pi*0*4*i/5 1 -2pi*1*4*i/5 1 -2pi*2*4*i/5 1 -2pi*3*4*i/5 1 -2pi*4*4*i/5 + 4 Y = - e + - e + - e + - e + - e + 4 4 4 4 4 4 + + \endcode + * + \code + + vpa(1/sym(4)*exp(-sym(2)*sym(pi)*0* 0 *i/sym(5)) + 1/sym(4)*exp(-sym(2)*sym(pi)*1* 0 *i/sym(5)) + 1/sym(4)*exp(-sym(2)*sym(pi)*sym(2)* 0 *i/sym(5)) + 1/sym(4)*exp(-sym(2)*sym(pi)*sym(3)* 0* i/sym(5)) + 1/sym(4)*exp(-sym(2)*sym(pi)*sym(4)* 0 *i/sym(5))) + vpa(1/sym(4)*exp(-sym(2)*sym(pi)*0* 1 *i/sym(5)) + 1/sym(4)*exp(-sym(2)*sym(pi)*1* 1 *i/sym(5)) + 1/sym(4)*exp(-sym(2)*sym(pi)*sym(2)* 1 *i/sym(5)) + 1/sym(4)*exp(-sym(2)*sym(pi)*sym(3)* 1* i/sym(5)) + 1/sym(4)*exp(-sym(2)*sym(pi)*sym(4)* 1 *i/sym(5))) + vpa(1/sym(4)*exp(-sym(2)*sym(pi)*0*sym(2)*i/sym(5)) + 1/sym(4)*exp(-sym(2)*sym(pi)*1*sym(2)*i/sym(5)) + 1/sym(4)*exp(-sym(2)*sym(pi)*sym(2)*sym(2)*i/sym(5)) + 1/sym(4)*exp(-sym(2)*sym(pi)*sym(3)*sym(2)*i/sym(5)) + 1/sym(4)*exp(-sym(2)*sym(pi)*sym(4)*sym(2)*i/sym(5))) + vpa(1/sym(4)*exp(-sym(2)*sym(pi)*0*sym(3)*i/sym(5)) + 1/sym(4)*exp(-sym(2)*sym(pi)*1*sym(3)*i/sym(5)) + 1/sym(4)*exp(-sym(2)*sym(pi)*sym(2)*sym(3)*i/sym(5)) + 1/sym(4)*exp(-sym(2)*sym(pi)*sym(3)*sym(3)*i/sym(5)) + 1/sym(4)*exp(-sym(2)*sym(pi)*sym(4)*sym(3)*i/sym(5))) + vpa(1/sym(4)*exp(-sym(2)*sym(pi)*0*sym(4)*i/sym(5)) + 1/sym(4)*exp(-sym(2)*sym(pi)*1*sym(4)*i/sym(5)) + 1/sym(4)*exp(-sym(2)*sym(pi)*sym(2)*sym(4)*i/sym(5)) + 1/sym(4)*exp(-sym(2)*sym(pi)*sym(3)*sym(4)*i/sym(5)) + 1/sym(4)*exp(-sym(2)*sym(pi)*sym(4)*sym(4)*i/sym(5))) + + ans = (sym) 1.2500000000000000000000000000000 + ans = (sym) 0.e-142 + 0.e-142⋅ⅈ + ans = (sym) 0.e-142 + 0.e-142⋅ⅈ + ans = (sym) 0.e-142 + 0.e-142⋅ⅈ + ans = (sym) 0.e-142 + 0.e-142⋅ⅈ + + \endcode + * + * Again, let's confirm with Mathematica: + * + \code + + In[6]:= Fourier[{1/4,1/4,1/4,1/4,1/4}, FourierParameters -> {1,-1}] + Out[6]= {1.25,5.55112*10^-17,5.55112*10^-17,5.55112*10^-17,5.55112*10^-17} + + \endcode + * + * + */ + +// Read the four bytes at \p as a float in the network protocol +inline float float_at(std::byte *p, size_t i) { + uint32_t as_uint = FromBE32(*(uint32_t*)(p + 4*i)); + return *(float*)&as_uint; +} + +// Test SoundAnalaysis against a trivial DFT +TEST(VisualizationTest, TrivialDft) +{ + using namespace std::chrono; + + // Let's represent our wave form as IEEE 754 single precisions floats, + // sampled once per second, with two channels (i.e. stereo). + AudioFormat fmt(1, SampleFormat::FLOAT, 2); + + // Sanity check-- 20 bytes is 5 samples, which should be five seconds' + // worth. 
Double for the two channels.
+	Visualization::SoundInfoCache::Duration us = fmt.SizeToTime(40);
+	EXPECT_EQ(us, seconds(5));
+
+	constexpr float samples[10] = { 0.5, 0.25, 0.5, 0.25, 0.5, 0.25, -1.5, 0.25, 0.5, 0.25 };
+	std::shared_ptr pcache =
+		std::make_unique(fmt, seconds(6)); // six seconds' capacity, just so we
+						   // don't need to worry
+	pcache->Add(samples, sizeof(samples));
+	EXPECT_EQ(pcache->Size(), 40);
+
+	Visualization::SoundInfoCache::Time t0, t1;
+	std::tie(t0, t1) = pcache->Range();
+	// `t0` is whatever time the first sample was added; what we know is that
+	// `t1` should be five seconds later.
+	auto d = t1 - t0;
+	EXPECT_EQ(d, seconds(5)) << "t0 is " << t0 << ", t1 is " << t1 << ", d is " << d;
+
+	// The full five-point DFT gives five Fourier coefficients per channel,
+	// corresponding to the frequencies 0Hz, 1/5Hz, 2/5, 3/5 & 4/5. Let's pick
+	// cutoffs that will discard the highest & the lowest, just for testing
+	// purposes.
+	SoundAnalysisParameters params { 5, 0.25, 0.75 };
+	SoundAnalysis analysis(params, pcache);
+
+	EXPECT_EQ(2, analysis.NumChan());
+	EXPECT_EQ(5, analysis.NumSamp());
+	EXPECT_EQ(3, analysis.NumFreq());
+
+	EXPECT_TRUE(analysis.Update(t1));
+
+	// Three coefficients per channel, two channels
+	fftwf_complex coeffs[6];
+	analysis.GetCoeffs(coeffs, sizeof(coeffs));
+
+	EXPECT_FLOAT_EQ(coeffs[0][0], 0.5);
+	EXPECT_FLOAT_EQ(coeffs[0][1], 0.0);
+	EXPECT_FLOAT_EQ(coeffs[1][0], 1.6180339887498948482045868343656);
+	EXPECT_FLOAT_EQ(coeffs[1][1], -1.1755705045849462583374119092781);
+	EXPECT_FLOAT_EQ(coeffs[2][0], -0.61803398874989484820458683436564);
+	EXPECT_FLOAT_EQ(coeffs[2][1], 1.9021130325903071442328786667588);
+	EXPECT_FLOAT_EQ(coeffs[3][0], 1.25);
+	EXPECT_FLOAT_EQ(coeffs[3][1], 0.0);
+	EXPECT_FLOAT_EQ(coeffs[4][0], 0.0);
+	EXPECT_FLOAT_EQ(coeffs[4][1], 0.0);
+	EXPECT_FLOAT_EQ(coeffs[5][0], 0.0);
+	EXPECT_FLOAT_EQ(coeffs[5][1], 0.0);
+
+	// bass/mids/trebs: 0/2/4 (left)
+	// bass/mids/trebs: 0/0/0 (right)
+
+	float bmt[6];
+	EXPECT_TRUE(analysis.GetBassMidsTrebs(bmt, 6));
+
+	EXPECT_FLOAT_EQ(bmt[0], 0.0);
+	EXPECT_FLOAT_EQ(bmt[1], 2.0);
+	EXPECT_FLOAT_EQ(bmt[2], 4.0);
+	EXPECT_FLOAT_EQ(bmt[3], 0.0);
+	EXPECT_FLOAT_EQ(bmt[4], 0.0);
+	EXPECT_FLOAT_EQ(bmt[5], 0.0);
+
+	// Serialization:
+	//
+	// +----------+----------+-------------+-----------+----------+---------+---------+----------+-----------+---------------+-----------------+
+	// | num_samp | num_chan | sample_rate | waveforms | num_freq | freq_lo | freq_hi | freq_off | coeffs    | power_spectra | bass/mids/trebs |
+	// | -------- | -------- | ----------- | --------- | -------- | ------- | ------- | -------- | --------- | ------------- | --------------- |
+	// | uint16_t | uint8_t  | uint16_t    | see below | uint16_t | float   | float   | uint16_t | see below | see below     | see below       |
+	// | 0005     | 02       | 0001        |           | 0003     | 0.25    | 0.75    | 0001     |           |               |                 |
+	// +----------+----------+-------------+-----------+----------+---------+---------+----------+-----------+---------------+-----------------+
+	//   2          1          2             40          2          4         4         2          48          24              24
+	// 153 octets, total
+
+	// waveforms:
+	// chan 0: 0.5, 0.5, 0.5, -1.5, 0.5
+	// chan 1: 0.25, 0.25, 0.25, 0.25, 0.25
+
+	// coeffs:
+	// chan 0: (1.6180339887498948482045868343656, -1.1755705045849462583374119092781), (-0.61803398874989484820458683436564, 1.9021130325903071442328786667588), (-0.61803398874989484820458683436564, -1.9021130325903071442328786667588)
+	// chan 1: (0.0, 0.0), (0.0, 0.0), (0.0, 0.0)
+
+	// spectra:
+	// chan 0: 2, 2, 2
+	// chan 1: 0, 0, 0
+
+	std::byte buf[153];
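+	// Byte offsets implied by the table above: num_samp @ 0, num_chan @ 2,
+	// sample_rate @ 3, waveforms @ 5 (40 bytes), num_freq @ 45, freq_lo @ 47,
+	// freq_hi @ 51, freq_off @ 55, coeffs @ 57 (48 bytes), spectra @ 105
+	// (24 bytes), bass/mids/trebs @ 129 (24 bytes)-- 153 octets in all.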
+	std::byte *p1 = analysis.SerializeSoundInfoFramePayload(buf);
+	std::byte *p0 = buf;
+	EXPECT_EQ(p1, p0 + 153);
+
+	EXPECT_EQ(FromBE16(*(uint16_t*)p0), 5); p0 += 2; // num_samp := 5
+	EXPECT_EQ(*p0, (std::byte)2); p0 += 1; // num_chan := 2
+	EXPECT_EQ(FromBE16(*(uint16_t*)p0), 1); p0 += 2; // sample_rate := 1
+
+	// waveform, channel 0
+	EXPECT_FLOAT_EQ(float_at(p0, 0), 0.5);
+	EXPECT_FLOAT_EQ(float_at(p0, 1), 0.5);
+	EXPECT_FLOAT_EQ(float_at(p0, 2), 0.5);
+	EXPECT_FLOAT_EQ(float_at(p0, 3), -1.5);
+	EXPECT_FLOAT_EQ(float_at(p0, 4), 0.5);
+	p0 += 20;
+
+	// waveform, channel 1
+	EXPECT_FLOAT_EQ(float_at(p0, 0), 0.25);
+	EXPECT_FLOAT_EQ(float_at(p0, 1), 0.25);
+	EXPECT_FLOAT_EQ(float_at(p0, 2), 0.25);
+	EXPECT_FLOAT_EQ(float_at(p0, 3), 0.25);
+	EXPECT_FLOAT_EQ(float_at(p0, 4), 0.25);
+	p0 += 20;
+
+	EXPECT_EQ(FromBE16(*(uint16_t*)p0), 3); p0 += 2; // num_freq := 3
+
+	EXPECT_FLOAT_EQ(float_at(p0, 0), 0.25); // freq_lo
+	EXPECT_FLOAT_EQ(float_at(p0, 1), 0.75); // freq_hi
+	p0 += 8;
+
+	EXPECT_EQ(FromBE16(*(uint16_t*)p0), 1); p0 += 2; // freq_off
+
+	// coefficients, channel 0
+	EXPECT_FLOAT_EQ(float_at(p0, 0), 1.6180339887498948482045868343656);
+	EXPECT_FLOAT_EQ(float_at(p0, 1), -1.1755705045849462583374119092781);
+	EXPECT_FLOAT_EQ(float_at(p0, 2), -0.61803398874989484820458683436564);
+	EXPECT_FLOAT_EQ(float_at(p0, 3), 1.9021130325903071442328786667588);
+	EXPECT_FLOAT_EQ(float_at(p0, 4), -0.61803398874989484820458683436564);
+	EXPECT_FLOAT_EQ(float_at(p0, 5), -1.9021130325903071442328786667588);
+	p0 += 24;
+
+	// For small quantities, absolute error is more reliable than relative
+	const float ZERO_THRESH = 1.0e-43f;
+
+	// coefficients, channel 1
+	EXPECT_NEAR(float_at(p0, 0), 0.0, ZERO_THRESH);
+	EXPECT_NEAR(float_at(p0, 1), 0.0, ZERO_THRESH);
+	EXPECT_NEAR(float_at(p0, 2), 0.0, ZERO_THRESH);
+	EXPECT_NEAR(float_at(p0, 3), 0.0, ZERO_THRESH);
+	EXPECT_NEAR(float_at(p0, 4), 0.0, ZERO_THRESH);
+	EXPECT_NEAR(float_at(p0, 5), 0.0, ZERO_THRESH);
+	p0 += 24;
+
+	// power spectra, channel 0
+	EXPECT_NEAR(float_at(p0, 0), 2.0, ZERO_THRESH);
+	EXPECT_NEAR(float_at(p0, 1), 2.0, ZERO_THRESH);
+	EXPECT_NEAR(float_at(p0, 2), 2.0, ZERO_THRESH);
+	p0 += 12;
+
+	// power spectra, channel 1
+	EXPECT_NEAR(float_at(p0, 0), 0.0, ZERO_THRESH);
+	EXPECT_NEAR(float_at(p0, 1), 0.0, ZERO_THRESH);
+	EXPECT_NEAR(float_at(p0, 2), 0.0, ZERO_THRESH);
+	p0 += 12;
+
+	// bass/mids/trebs
+
+	EXPECT_NEAR(float_at(p0, 0), 0.0, ZERO_THRESH);
+	EXPECT_NEAR(float_at(p0, 1), 2.0, ZERO_THRESH);
+	EXPECT_NEAR(float_at(p0, 2), 4.0, ZERO_THRESH);
+	p0 += 12;
+
+	EXPECT_NEAR(float_at(p0, 0), 0.0, ZERO_THRESH);
+	EXPECT_NEAR(float_at(p0, 1), 0.0, ZERO_THRESH);
+	EXPECT_NEAR(float_at(p0, 2), 0.0, ZERO_THRESH);
+	p0 += 12;
+}
+
+// Now let's try a more realistic sampling rate
+TEST(VisualizationTest, SinesAndCosines)
+{
+	using namespace std::chrono;
+
+	const float TWO = 2.f;
+	const float FOUR = 4.f;
+
+	// Everything below is driven off `NUM_SAMP`-- the higher this number is,
+	// the closer we'll get to a Dirac delta function at these functions'
+	// frequencies.
+	const size_t NUM_SAMP = 1025;
+
+	const size_t NUM_COEFF = (NUM_SAMP / 2) + 1;
+	const size_t SAMPLE_RATE_HZ = size_t((float)NUM_SAMP / 6.28318531f) + 1;
+
+	// Just for fun (and better test coverage) we'll represent our waveforms as
+	// signed 16-bit integers, sampled at ceil(NUM_SAMP / (2*Pi)) Hz, with two
+	// channels (i.e. stereo).
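+	// With NUM_SAMP = 1025, SAMPLE_RATE_HZ works out to floor(1025 / 6.2832)
+	// + 1 = 164, so each one-second block below holds 164 frames & the seven
+	// blocks (1148 frames) comfortably cover the 1025-sample analysis window.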
+ AudioFormat fmt(SAMPLE_RATE_HZ, SampleFormat::S16, 2); + + // Let's sample over the entire period of these functions (2Pi =~ 6.28) + std::shared_ptr pcache = + std::make_shared(fmt, seconds(7)); + + // Sample the functions over all of [0, 2*Pi), so the DFT has a chance + // to "see" all the frequencies in one period of each function. + int16_t samples[SAMPLE_RATE_HZ * 2]; + // We sample the waveforms one second at a time, filling-up the cache as we + // go: + for (size_t i = 0; i < 7; ++i) { + for (size_t j = 0; j < SAMPLE_RATE_HZ; ++j) { + float x = (float)i + float(j) / (float)SAMPLE_RATE_HZ; + float f = sin(x) + cos(TWO * x) / TWO; + float g = sin(TWO * x) + cos(FOUR * x) / FOUR; + + // -1.5 <= f <= 0.75 (approx), & -1.25 <= g <= 0.75 (approx), so + // -let's scale f & g. + samples[2 * j ] = (int16_t)(f * 1024.f); + samples[2 * j + 1] = (int16_t)(g * 1024.f); + } + pcache->Add(samples, sizeof(samples)); + } + + Visualization::SoundInfoCache::Time t0, t1; + std::tie(t0, t1) = pcache->Range(); + + // Quick sanity check-- `t0` is whatever time the first sample was added; + // what we *do* know is that `t1` should be seven seconds later. + auto d = t1 - t0; + EXPECT_EQ(d, seconds(7)) << "t0 is " << t0 << ", t1 is " << t1 << ", d is " << d; + + // OK-- compute the DFT: + SoundAnalysisParameters params(NUM_SAMP, 0.f, 20000.f); + SoundAnalysis analysis(params, pcache); + + EXPECT_TRUE(analysis.Update(t1)); + fftwf_complex coeffs[2 * NUM_COEFF]; + EXPECT_TRUE(analysis.GetCoeffs(coeffs, sizeof(coeffs))); + + float spectra[2 * NUM_COEFF]; + for (size_t i = 0; i < NUM_COEFF; ++i) { + float mag_left = sqrt(coeffs[i][0] * coeffs[i][0] + coeffs[i][1] * coeffs[i][1]); + spectra[i] = mag_left > 1.0f ? mag_left : 0.f; // threshold + + float mag_right = sqrt(coeffs[NUM_COEFF + i][0] * coeffs[NUM_COEFF + i][0] + + coeffs[NUM_COEFF + i][1] * coeffs[NUM_COEFF + i][1]); + spectra[NUM_COEFF + i] = mag_right > 1.0f ? 
mag_right : 0.f; // threshold + } + + // left: should see frequency at coeff 1 & coeff 2 (half as big as one) + float abs_err = spectra[1] / 50.f; + EXPECT_NEAR(spectra[1], TWO * spectra[2], abs_err); + + float thresh = spectra[1] / 50.f; + for (size_t i = 0; i < NUM_COEFF; ++i) { + if (i != 1 && i != 2) { + EXPECT_TRUE(spectra[i] < thresh) + << "i is " << i << ", threshold is " << thresh << + ", spectra[i] is " << spectra[i]; + } + } + + // right: should see 'em at 2 & 4 (the one at 4 being one-quarter the size) + abs_err = spectra[NUM_COEFF + 2] / 50.f; + EXPECT_NEAR(spectra[NUM_COEFF + 2], FOUR * spectra[NUM_COEFF + 4], abs_err); + thresh = spectra[NUM_COEFF + 2] /50.f; + for (size_t i = 0; i < NUM_COEFF; ++i) { + if (i != 2 && i != 4) { + EXPECT_TRUE(spectra[NUM_COEFF + i] < thresh) + << "i is " << i << ", threshold is " << thresh << + ", spectra[NUM_COEFF + i] is " << spectra[NUM_COEFF + i]; + } + } + +} + +// Network protocol -- deserialization +TEST(VisualizationTest, TestDeCliHlo) +{ + ClientHello clihlo; + uint8_t incomplete_buf_0[] = { 0x00 }; + EXPECT_EQ(ParseResult::NEED_MORE_DATA, + ParseClihlo(incomplete_buf_0, sizeof(incomplete_buf_0), clihlo)); + + // Correct message type, length is zero + uint8_t incomplete_buf_1[] = { 0x00, 0x00, 0x00, 0x00 }; + EXPECT_EQ(ParseResult::NEED_MORE_DATA, + ParseClihlo(incomplete_buf_1, sizeof(incomplete_buf_1), clihlo)); + + // Correct message type, length is correct, payload is incomplete + uint8_t incomplete_buf_2[] = { 0x00, 0x00, 0x00, 0x06, 0x00, 0x01, 0x00, 0x20 }; + EXPECT_EQ(ParseResult::NEED_MORE_DATA, + ParseClihlo(incomplete_buf_2, sizeof(incomplete_buf_2), clihlo)); + + // Correct message type, length is correct, missing "check byte" + uint8_t incomplete_buf_3[] = { + 0x00, 0x00, + 0x00, 0x06, + 0x00, 0x01, + 0x00, 0x20, + 0x00, 0xff + }; + EXPECT_EQ(ParseResult::NEED_MORE_DATA, + ParseClihlo(incomplete_buf_3, sizeof(incomplete_buf_3), clihlo)); + + // Correct message, except the length is incorrect + uint8_t incomplete_buf_4[] = { + 0x00, 0x00, + 0x00, 0x05, + 0x00, 0x01, + 0x00, 0x20, + 0x00, 0xff + }; + EXPECT_EQ(ParseResult::NEED_MORE_DATA, + ParseClihlo(incomplete_buf_4, sizeof(incomplete_buf_4), clihlo)); + + // Finally correct + uint8_t complete_buf_0[] = { + 0x00, 0x00, + 0x00, 0x06, + 0x00, 0x01, + 0x00, 0x20, + 0x00, 0xff, + 0x00 + }; + EXPECT_EQ(ParseResult::OK, + ParseClihlo(complete_buf_0, sizeof(complete_buf_0), clihlo)); + + EXPECT_EQ(clihlo.major_version, 0); + EXPECT_EQ(clihlo.minor_version, 1); + EXPECT_EQ(clihlo.requested_fps, 32); + EXPECT_EQ(clihlo.tau, 255); +} + +// Network protocol -- serialization +TEST(VisualizationTest, TestSerSrvHlo) +{ + using std::byte; + + byte buf[] = { + (byte)0x00, (byte)0x00, // type + (byte)0x00, (byte)0x00, // length + (byte)0x00, (byte)0x00, // payload + (byte)0x00, // check + (byte)0xaa // tombstone + }; + + SerializeSrvhlo((byte)3, (byte)2, buf); + + ASSERT_EQ(buf[0], (byte)0x00); + ASSERT_EQ(buf[1], (byte)0x01); + ASSERT_EQ(buf[2], (byte)0x00); + ASSERT_EQ(buf[3], (byte)0x02); + ASSERT_EQ(buf[4], (byte)0x03); + ASSERT_EQ(buf[5], (byte)0x02); + ASSERT_EQ(buf[6], (byte)0x00); + ASSERT_EQ(buf[7], (byte)0xaa); +} diff --git a/test/meson.build b/test/meson.build index 52833feea7..0840beb9a3 100644 --- a/test/meson.build +++ b/test/meson.build @@ -635,3 +635,38 @@ if alsa_dep.found() endif subdir('fs') + +# +# Visualization Output +# + +test( + 'test_vis', + executable( + 'test_vis', + 'TestVisualization.cxx', + include_directories: inc, + dependencies: [ + output_plugins_dep, + 
gtest_dep,
+    ],
+  ),
+  protocol: 'gtest',
+)
+
+#
+# Visualization client
+#
+
+executable(
+  'run_vis',
+  'run_vis.cxx',
+  include_directories: inc,
+  dependencies: [
+    output_registry_dep,
+    encoder_glue_dep,
+    event_dep,
+    cmdline_dep,
+  ],
+)
+
diff --git a/test/run_vis.cxx b/test/run_vis.cxx
new file mode 100644
index 0000000000..b7e13d09ad
--- /dev/null
+++ b/test/run_vis.cxx
@@ -0,0 +1,336 @@
+#include "net/SocketAddress.hxx"
+#include "net/SocketDescriptor.hxx"
+#include "util/ByteOrder.hxx"
+#include "util/PrintException.hxx"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+using std::byte;
+using std::make_tuple;
+using std::span;
+
+class CliError : public std::runtime_error
+{
+public:
+	CliError(const char *pmsg) : std::runtime_error(pmsg)
+	{ }
+	CliError(const std::string &msg) : std::runtime_error(msg)
+	{ }
+};
+
+/// Parse the command line, return our parameters
+static std::tuple
+ParseCl(int argc, char **argv)
+{
+	if (5 != argc) {
+		throw CliError("Four arguments expected");
+	}
+
+	uint16_t port = atoi(argv[2]);
+	if (0 == port) {
+		throw CliError("Couldn't parse port");
+	}
+
+	uint16_t fps = atoi(argv[3]);
+	if (0 == fps) {
+		throw CliError("Couldn't parse fps");
+	}
+
+	int16_t tau = atoi(argv[4]);
+	// Arghhh... no way to distinguish between "0" and error
+
+	return std::make_tuple(argv[1], port, fps, tau);
+}
+
+/// Connect to the MPD visualization server
+static std::variant
+Connect(const std::string &host, uint16_t port)
+{
+	struct sockaddr_in addr;
+	addr.sin_family = AF_INET;
+	addr.sin_port = htons(port);
+
+	if (0 >= inet_aton(host.c_str(), &addr.sin_addr)) {
+		std::string msg = "Failed to parse '" + host + "' as an IPv4 address (" +
+			strerror(errno) + ")";
+		throw CliError(msg);
+	}
+
+	SocketAddress sock_addr((const struct sockaddr*)&addr, sizeof(addr));
+
+	SocketDescriptor sock;
+	if (!sock.Create(AF_INET, SOCK_STREAM, 0)) {
+		throw std::runtime_error("Failed to 'Create' the SocketDescriptor.");
+	}
+
+	if (sock.Connect(sock_addr)) {
+		return sock;
+	}
+
+	return std::monostate { };
+}
+
+static std::tuple
+Handshake(SocketDescriptor &sock, uint16_t fps, int16_t tau)
+{
+	static byte buf[11] = {
+		byte{0x00}, byte{0x00}, // message type
+		byte{0x00}, byte{0x06}, // payload length
+		byte{0x00}, byte{0x01}, // request version 0.1
+	};
+
+	uint16_t fpsn = htons(fps);
+	int16_t taun = htons(tau);
+	// NB: cast to byte* *before* the pointer arithmetic-- `&fpsn + 2` would
+	// advance by two uint16_t's (four bytes) & read out of bounds
+	std::copy((byte*)&fpsn, (byte*)&fpsn + 2, buf + 6);
+	std::copy((byte*)&taun, (byte*)&taun + 2, buf + 8);
+	buf[10] = byte{0};
+
+	ssize_t cb = sock.Write(span(buf, buf + sizeof(buf)));
+	if (0 >= cb) {
+		throw std::runtime_error(strerror(errno));
+	}
+	if (cb != sizeof(buf)) {
+		throw std::runtime_error("Incomplete write.");
+	}
+
+	cb = sock.Read(span(buf, buf + sizeof(buf)));
+	if (0 >= cb) {
+		throw std::runtime_error(strerror(errno));
+	}
+
+	byte *p = buf;
+	uint16_t msgtype = FromBE16(*(uint16_t *)p); p += 2;
+	if (0x0001 != msgtype) {
+		throw std::runtime_error("Unexpected message type");
+	}
+
+	uint16_t msglen = ntohs(*(uint16_t *)p); p += 2;
+	if (0x0002 != msglen) {
+		throw std::runtime_error("Unexpected message length");
+	}
+
+	uint8_t proto_ver_major = (uint8_t)*p++;
+	uint8_t proto_ver_minor = (uint8_t)*p++;
+
+	return make_tuple(proto_ver_major, proto_ver_minor);
+}
+
+/// Listen for FRAME messages, print out bass/mids/trebs
+static void
+Listen(SocketDescriptor &sock)
+{
+	using namespace std;
+	using namespace std::chrono;
+
+	byte buf[8192];
+
+	// This will hold num_chan * 8 floats, to compute a weighted average of
+	// recent bass values-- will initialize on first FRAME
+	vector bass;
+	// index of the "next" slot for a bass value
+	size_t bass_idx = 0;
+
+	const float WEIGHTS[] = { 1.67772f, 2.09715f, 2.62144f, 3.2768f, 4.096f, 5.12f, 6.4f, 8.0f };
+
+	for (size_t i = 0; ; ++i) {
+		ssize_t cb = sock.Read(span(buf, buf + sizeof(buf)));
+		if (0 >= cb) {
+			if (0 == errno) {
+				cout << "MPD went away." << endl;
+				return;
+			}
+			throw std::runtime_error(strerror(errno));
+		}
+
+		std::time_t now;
+		if ((std::time_t)-1 == std::time(&now)) {
+			throw std::runtime_error(strerror(errno));
+		}
+
+		if (cb == sizeof(buf)) {
+			throw std::runtime_error("Buffer overflow!");
+		}
+
+		// Hmmm... let's begin parsing (tho I think for now I'll just be
+		// interested in bass/mids/trebs as a crude manual test).
+		byte *p = buf;
+
+		uint32_t sentinel = ntohl(*(uint32_t *)p);
+		p += 4;
+		if (0x63ac8403 != sentinel) {
+			throw std::runtime_error("Missing sentinel!");
+		}
+
+		uint16_t msg_type = FromBE16(*(uint16_t*)p); p += 2;
+		if (0x1000 != msg_type) {
+			stringstream stm;
+			stm << "Unexpected message type 0x" << hex << msg_type << "!";
+			throw std::runtime_error(stm.str());
+		}
+
+		uint16_t msg_len = FromBE16(*(uint16_t*)p); p += 2;
+		uint16_t num_samp = FromBE16(*(uint16_t*)p); p += 2;
+		uint8_t num_chan = *(uint8_t*)p; p += 1;
+		/*uint16_t sample_rate = FromBE16(*(uint16_t*)p);*/ p += 2;
+
+		if (0 == bass.size()) {
+			bass.resize(num_chan * 8, 0.0f);
+		}
+
+		// Skip over waveforms for now!
+		p += num_samp * num_chan * 4;
+
+		uint16_t num_freq = FromBE16(*(uint16_t*)p); p += 2;
+		/*uint32_t tmp = ntohl(*(uint32_t *)p);*/ p += 4;
+		/*float freq_lo = *(float*)&tmp;*/
+		/*tmp = ntohl(*(uint32_t *)p);*/ p += 4;
+		/*float freq_hi = *(float*)&tmp;*/
+
+		/*uint16_t freq_off = FromBE16(*(uint16_t*)p);*/ p += 2;
+
+		// Let's skip the Fourier coefficients....
+		p += num_chan * num_freq * 8;
+		// as well as the power spectra
+		p += num_chan * num_freq * 4;
+
+		auto now_ms = duration_cast(system_clock::now().time_since_epoch());
+		cout << put_time(gmtime(&now), "%c %Z") << ": [" <<
+			now_ms.count() << "](" <<
+			msg_len << " bytes) ";
+
+		// OK-- let's just grab bass/mids/trebs for each channel.
+		float mean_bass = 0.0f, mean_mids = 0.0f, mean_trebs = 0.0f;
+		for (uint8_t j = 0; j < num_chan; ++j) {
+
+			if (j) {
+				cout << " ";
+			}
+
+			uint32_t tmp = ntohl(*(uint32_t *)p); p += 4;
+			float this_bass = *(float*)&tmp;
+			tmp = ntohl(*(uint32_t *)p); p += 4;
+			float this_mids = *(float*)&tmp;
+			tmp = ntohl(*(uint32_t *)p); p += 4;
+			float this_trebs = *(float*)&tmp;
+
+			mean_bass += this_bass;
+			mean_mids += this_mids;
+			mean_trebs += this_trebs;
+
+			// record the bass in this channel for use below in beat detection
+			bass[j * 8 + bass_idx] = this_bass;
+
+			cout << this_bass << "/" << this_mids << "/" << this_trebs;
+		}
+
+		cout << " ";
+
+		mean_bass /= (float) num_chan;
+		mean_mids /= (float) num_chan;
+		mean_trebs /= (float) num_chan;
+
+		// beat detection-- very crude. We'll compute a weighted average of the
+		// bass in each channel. Note that this calculation will be incorrect
+		// for the first seven frames-- meh 🤷
+		float weighted_mean_bass = 0.0f;
+		for (uint8_t j = 0; j < num_chan; ++j) {
+
+			if (j) {
+				cout << "/";
+			}
+
+			// Given the way we're indexing, the weighted sum will come in two
+			// parts:
+
+			// the first will be bass[bass_idx]*WEIGHTS[7] + ... + bass[0]*WEIGHTS[7-bass_idx]
+
+			// the second will be bass[bass_idx+1]*WEIGHTS[0] + ...
+ bass[7]*WEIGHTS[6-idx] + // when idx < 7 + + float weighted_mean = 0.0f; + for (ptrdiff_t k = bass_idx, n = 0; k >= 0; --k, ++n) { + weighted_mean += bass[j*8+k] * WEIGHTS[7-n]; + } + if (bass_idx < 7) { + for (size_t k = bass_idx+1, n = 0; k < 8; ++k, ++n) { + weighted_mean += bass[j*8+k] * WEIGHTS[n]; + } + } + + weighted_mean /= 33.2891f; // Sum of weights + + cout << weighted_mean; + + weighted_mean_bass += weighted_mean; + } + + bass_idx = (bass_idx + 1) % 8; + + cout << " "; + + // `weighted_mean_bass` is the average weighted average of the bass across + // all channels-- this is what we use for our signal. + weighted_mean_bass /= (float)num_chan; + + float thresh = weighted_mean_bass * 0.325f; + if ((mean_bass - weighted_mean_bass) > thresh) { + cout << " BEAT DETECTED"; + } + cout << endl; + } +} + +/// Testing client for the visualization output plugin +/// Invoke as `run_vis mpd-host port fps time-offset` +int main(int argc, char **argv) { + using namespace std; + + try { + string mpd_host; + int16_t tau; + uint16_t port, fps; + tie(mpd_host, port, fps, tau) = ParseCl(argc, argv); + + while (true) { + + auto conn = Connect(mpd_host, port); + if (0 == conn.index()) { + cout << "Failed to connect; sleeping for fifteen seconds & retrying (hit Ctrl-C to exit)." << endl; + std::this_thread::sleep_for(15000ms); + continue; + } + + auto sock = std::get(conn); + cout << "Connected." << endl; + + uint8_t major, minor; + tie(major, minor) = Handshake(sock, fps, tau); + cout << "Received protocol version " << (int)major << + "." << (int)minor << "." << endl; + + Listen(sock); + cout << "Sleeping for thirty seconds & retrying (hit Ctrl-C to exit)." << endl; + std::this_thread::sleep_for(30000ms); + } + } catch (const CliError &ex) { + PrintException(ex); + return 2; + } catch (...) { + PrintException(std::current_exception()); + return EXIT_FAILURE; + } + + return EXIT_SUCCESS; +}
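+
+// Example invocation (illustrative values only-- assumes the visualization
+// plugin is listening on port 8001 of the local host; the third argument is
+// the requested frame rate, the fourth the time offset, tau, in ms):
+//
+//     run_vis 127.0.0.1 8001 25 500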