diff --git a/_static/mumble.css b/_static/mumble.css
index 19bd6038c0f..b1a7308c5e5 100644
--- a/_static/mumble.css
+++ b/_static/mumble.css
@@ -4,4 +4,16 @@
 
 th {
 	background-color: #adadad;
-}
\ No newline at end of file
+}
+
+table.bits8 {
+	text-align: center;
+	table-layout: fixed;
+	width: 300px;
+}
+
+table.bits16 {
+	text-align: center;
+	table-layout: fixed;
+}
+
diff --git a/conf.py b/conf.py
index f75f45cb6d7..c4400e4544a 100644
--- a/conf.py
+++ b/conf.py
@@ -2,6 +2,9 @@
 
 import sys, os
 
+# on_rtd is whether we are on readthedocs.org
+on_rtd = os.environ.get('READTHEDOCS', None) == 'True'
+
 extensions = [
 	'sphinx.ext.pngmath',
 ]
@@ -32,16 +35,22 @@
 
 pygments_style = 'sphinx'
 
-html_theme = 'default'
-html_theme_options = {
-	'footerbgcolor':    '#555555',
-	'relbarbgcolor':    '#222222',
-	'sidebarbgcolor':   '#333333',
-	'linkcolor':        '#696969',
-	'visitedlinkcolor': '#adadad',
-	'sidebarlinkcolor': '#cacaca',
-	'headtextcolor':    '#000000',
-}
+if not on_rtd:  # only import and set the theme if we're building docs locally
+    import sphinx_rtd_theme
+    html_theme = 'sphinx_rtd_theme'
+    html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
+else:
+    html_theme = 'default'
+
+    html_theme_options = {
+            'footerbgcolor':    '#555555',
+            'relbarbgcolor':    '#222222',
+            'sidebarbgcolor':   '#333333',
+            'linkcolor':        '#696969',
+            'visitedlinkcolor': '#adadad',
+            'sidebarlinkcolor': '#cacaca',
+            'headtextcolor':    '#000000',
+    }
 
 # Add any paths that contain custom themes here, relative to this directory.
 #html_theme_path = []
@@ -80,4 +89,4 @@
 html_domain_indices = False
 
 html_show_sphinx = True
-html_show_copyright = True
\ No newline at end of file
+html_show_copyright = True
diff --git a/index.rst b/index.rst
index 0f0f24848b4..84771136f2a 100644
--- a/index.rst
+++ b/index.rst
@@ -1,10 +1,11 @@
 Mumble Protocol Documentation
 =============================
 
-Contents:
+Contents
+--------
 
 .. toctree::
-   :maxdepth: 2
+   :maxdepth: 3
 
    introduction
    overview
@@ -13,8 +14,10 @@ Contents:
    voice_data
 
 
-Indices and tables
-==================
+.. # These are empty pages:
 
-* :ref:`genindex`
-* :ref:`search`
\ No newline at end of file
+.. # Indices and tables
+.. # ==================
+
+.. # * :ref:`genindex`
+.. # * :ref:`search`
diff --git a/introduction.rst b/introduction.rst
index 94187a4162f..02018d765b9 100644
--- a/introduction.rst
+++ b/introduction.rst
@@ -2,8 +2,9 @@ Introduction
 ============
 
 This document is meant to be a reference for the Mumble VoIP 1.2.X
-server-client communication protocol.  It reflects the state of the
-protocol implemented in the Mumble 1.2.2 client and might be outdated
-by the time you are reading this. Be sure to check for newer revisions
-of this document on our website \url{http://www.mumble.info}. At the
-moment this document is work in progress.
\ No newline at end of file
+server-client communication protocol. It reflects the state of the protocol
+implemented in the Mumble 1.2.8 client and might be outdated by the time you
+are reading this. Be sure to check for newer revisions of this document at
+http://mumble-protocol.readthedocs.org/.
+
+This document is a constant work in progress.
diff --git a/voice_data.rst b/voice_data.rst
index 358dce06f1e..8325b8fe027 100644
--- a/voice_data.rst
+++ b/voice_data.rst
@@ -3,176 +3,286 @@
 Voice data
 ==========
 
-.. _enable-udp-channel:
+The Mumble audio channel is used to transmit the actual audio packets over the
+network. Unlike the TCP control channel, the audio channel uses a custom
+encoding for the audio packets. The audio channel is transport independent and
+features such as encryption are implemented by the transport layer. Integers
+wider than 8 bits are encoded using the `Variable length integer encoding`_.
+
+.. _packet-format:
+
+Packet format
+-------------
+
+The mumble audio channel packets are variable length packets that begin with an
+8-bit header field which describes the packet type and target. The most
+significant 3 bits define the packet type while the remaining 5 bits define the
+target. The header is followed by the packet payload. The maximum size for the
+whole audio data packet is 1020 bytes. This allows applications to use 1024
+byte buffers for receiving UDP datagrams with the 4-byte encryption header
+overhead.
+
+.. _Audio packet structure:
+.. table:: Audio packet structure
+    :class: bits8
+
+    +-------------------------------+
+    | Audio packet structure        |
+    +===+===+===+===+===+===+===+===+
+    | 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 |
+    +---+---+---+---+---+---+---+---+
+    |  ``type`` |    ``target``     |
+    +-----------+-------------------+
+    |          Payload...           |
+    +-------------------------------+
+
+type
+  The audio packet type. The packets transmitted over the audio channel are
+  either ping packets used to diagnose the transport layer connectivity or
+  audio packets encoded with different codecs. The different types are listed in
+  the `Audio packet types`_ table.
+
+.. _Audio packet types:
+.. table:: Audio packet types
+
+   +---------+---------------+--------------------------------------------+
+   | Type    |   Bitfield    | Description                                |
+   +=========+===============+============================================+
+   | ``0``   | ``000xxxxx``  | CELT Alpha encoded voice data              |
+   +---------+---------------+--------------------------------------------+
+   | ``1``   | ``001xxxxx``  | Ping packet                                |
+   +---------+---------------+--------------------------------------------+
+   | ``2``   | ``010xxxxx``  | Speex encoded voice data                   |
+   +---------+---------------+--------------------------------------------+
+   | ``3``   | ``011xxxxx``  | CELT Beta encoded voice data               |
+   +---------+---------------+--------------------------------------------+
+   | ``4``   | ``100xxxxx``  | OPUS encoded voice data                    |
+   +---------+---------------+--------------------------------------------+
+   | ``5-7`` |               | Unused                                     |
+   +---------+---------------+--------------------------------------------+
+
+target
+  The target portion defines the recipient for the audio data. The two constant
+  targets are *Normal talking* (``0``) and *Server Loopback* (``31``). The
+  range 1-30 is reserved for whisper targets. These targets are specified
+  separately in the control channel using the ``VoiceTarget`` packets. The
+  targets are listed in `Audio targets`_ table.
+
+  When a client registers a VoiceTarget on the server, it gives the target an
+  ID. This voice target ID can be used as a target in the voice packets to send
+  audio to specific users or channels. When receiving whisper-audio the server
+  uses target 1 to specify that the audio results from a whisper to a channel and
+  target 2 to specify that the audio results from a direct whisper to the user.
+
+.. _Audio targets:
+.. table:: Audio targets
+
+   +-----------+-----------------------------------------------------+
+   | Target    | Description                                         |
+   +===========+=====================================================+
+   | ``0``     | Normal talking                                      |
+   +-----------+-----------------------------------------------------+
+   | ``1-30``  | Whisper target                                      |
+   |           |                                                     |
+   |           | - VoiceTarget ID when sending whisper from client.  |
+   |           | - 1 when receiving whisper to channel.              |
+   |           | - 2 when receiving direct whisper to user.          |
+   +-----------+-----------------------------------------------------+
+   | ``31``    | Server loopback                                     |
+   +-----------+-----------------------------------------------------+
+
+Ping packet
+~~~~~~~~~~~
+
+Audio channel ping packets are used as part of the connectivity checks on the
+audio transport layer. These packets contain only a varint encoded timestamp as
+data. See the `UDP connectivity checks`_ section below for the logic involved in
+the connectivity checks.
+
+.. _Audio transport ping packet:
+
+.. table:: Audio transport ping packet
+
+   +------------+-------------+----------------------------------+
+   | Field      | Type        | Description                      |
+   +============+=============+==================================+
+   | Header     | ``byte``    | ``00100000b`` (``0x20``)         |
+   +------------+-------------+----------------------------------+
+   | Data       | ``varint``  | Timestamp                        |
+   +------------+-------------+----------------------------------+
+
+Header
+  Common audio packet header. For ping packets this should have the value of
+  0x20.
 
-Enabling the UDP channel
-------------------------
+Data
+  Timestamp. The packet should be echoed back so the timestamp format can be
+  decided by the original sender - the only limitation is that it must fit in a
+  64-bit integer for the varint encoding.
+
+Encoded audio data packet
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Encoded audio packets contain the actual user audio data for the voice
+communication. Incoming audio data packets contain the common header byte
+followed by varint encoded session ID of the source user and varint encoded
+sequence number of the packet. Outgoing audio data packets contain only the
+header byte and the sequence number of the packet. The server matches these to
+the correct session using the transport layer information.
+
+The remainder of the packet is made up of multiple encoded audio segments and
+optional positional audio information. The audio segment format depends on the
+codec of the whole audio packet. The audio segments contain codec
+implementation specific information on where the audio segments end so the
+possible positional audio data can be read from the end.
+
+.. _Incoming encoded audio packet:
+.. table:: Incoming encoded audio packet
+
+   +--------------------+--------------+-----------------------------------------------------------+
+   | Field              | Type         | Description                                               |
+   +====================+==============+===========================================================+
+   | Header             | ``byte``     | Codec type/Audio target                                   |
+   +--------------------+--------------+-----------------------------------------------------------+
+   | Session ID         | ``varint``   | Session ID of the source user.                            |
+   +--------------------+--------------+-----------------------------------------------------------+
+   | Sequence Number    | ``varint``   | Sequence number of the first audio data **segment**.      |
+   +--------------------+--------------+-----------------------------------------------------------+
+   | Payload            | ``byte[]``   | Audio payload                                             |
+   +--------------------+--------------+-----------------------------------------------------------+
+   | Position Info      | ``float[3]`` | Positional audio information                              |
+   +--------------------+--------------+-----------------------------------------------------------+
+
+
+.. _Outgoing encoded audio packet:
+.. table:: Outgoing encoded audio packet
+
+   +--------------------+--------------+-----------------------------------------------------------+
+   | Field              | Type         | Description                                               |
+   +====================+==============+===========================================================+
+   | Header             | ``byte``     | Codec type/Audio target                                   |
+   +--------------------+--------------+-----------------------------------------------------------+
+   | Sequence Number    | ``varint``   | Sequence number of the first audio data **segment**.      |
+   +--------------------+--------------+-----------------------------------------------------------+
+   | Payload            | ``byte[]``   | Audio payload                                             |
+   +--------------------+--------------+-----------------------------------------------------------+
+   | Position Info      | ``float[3]`` | Positional audio information                              |
+   +--------------------+--------------+-----------------------------------------------------------+
+
+Header
+  The common audio packet header.
+
+Session ID
+  Session ID of the user to whom the audio packet belongs.
+
+Sequence Number
+  Audio data sequence number. The sequence number is used to maintain the
+  packet order when the audio data is transported over unreliable transports
+  such as UDP.
+
+  The sequence number might increase by more than one between subsequent audio
+  packets in case the audio packets contain multiple audio segments. This
+  allows the packet loss concealment algorithms to figure out how many audio
+  frames were lost between two received packets.
+
+Payload
+  Audio payload. Format depends on the audio codec defined in the Header. The
+  payload must be self-delimiting to determine whether the position info exists
+  at the end of the packet.
+
+Position Info
+  The XYZ coordinates of the audio source. In addition to sending the position
+  information, the user must be using a positional plugin defined in the
+  ``UserState`` message. The plugins might define different contexts which
+  prevent voice communication between users in other contexts.
+
+Speex and CELT audio frames
+"""""""""""""""""""""""""""
+
+Encoded Speex and CELT audio is transported as individual encoded frames. Each
+frame is prefixed with a single-byte length and continuation header.
+
+.. _celt-encoded-audio-data:
+
+.. table:: CELT encoded audio data
+
+   +---------+-------------+-----------------------------------------+
+   | Field   | Type        | Description                             |
+   +=========+=============+=========================================+
+   | Header  | ``byte``    | length/continuation header              |
+   +---------+-------------+-----------------------------------------+
+   | Data    | ``byte[]``  | Encoded voice frame                     |
+   +---------+-------------+-----------------------------------------+
+
+Header
+  The length of the Data field. The most significant bit (``0x80``) acts as the
+  continuation bit and is set for all but the last frame in the payload. The
+  remaining 7 bits of the header contain the actual length of the Data frame.
+
+  Note the length may be zero, which is used to signal the end of a voice
+  transmission. In this case the audio data is a single zero-byte which can be
+  interpreted normally as length of 0 with no continuation bit set.
+
+Data
+  Single encoded audio frame. The encoding depends on the codec ``type`` header
+  of the whole audio packet.
+
+Opus audio frames
+"""""""""""""""""
+
+Encoded Opus audio is transported as a single Opus audio frame. The frame is prefixed with a variable-length header.
+
+.. _opus-encoded-audio-data:
 
-Before the UDP channel can reliably be used both sides should
-be certain that the connection works. Before the server may use
-the UDP connection to the client the client must first open a UDP
-socket and communicate its address to the server by sending a packet
-over UDP. Once the server has received an UDP transmission the server
-should start using the UDP channel for the voice packets. Respectively 
-he client should not use the UDP channel for voice data until it is
-certain that the packets go through to the server.
-
-In practice these requirements are filled with UDP ping. When the server
-receives a UDP ping packet from the client it echoes the packet back.
-When the client receives this packet it can ascertain that the UDP channel
-works for two-way communication.
-
-.. _udp-ping-packet:
-
-.. table:: UDP ping packet
-   
-   +---------------------------------------------------------+
-   | UDP ping packet                                         |
-   +======================+==================================+
-   | byte                 | type/flags (0010 0000 for Ping)  |
-   +----------------------+----------------------------------+
-   | varint               | timestamp                        |
-   +----------------------+----------------------------------+
-
-If the client stops receiving replies to the UDP packets at some point or never
-receives the first one it should immediately start tunneling the voice communication
-through TCP as described in the *UDP tunnel* section. When the server
-receives a tunneled packet over the TCP connection it must also stop using
-the UDP for communication. The client may continue sending UDP ping packets
-over the UDP channel and the server must echo these if it receives them. If
-the client later receives these echoes it may switch back to the UDP channel
-for voice communication. When the server receives an UDP voice communication
-packet from the client it should stop tunneling the packets as well.
-
-
-.. _udp-data:
+.. table:: Opus encoded audio data
+
+   +---------+-------------+-----------------------------------------+
+   | Field   | Type        | Description                             |
+   +=========+=============+=========================================+
+   | Header  | ``varint``  | length/terminator header                |
+   +---------+-------------+-----------------------------------------+
+   | Data    | ``byte[]``  | Encoded voice frame                     |
+   +---------+-------------+-----------------------------------------+
+
+Header
+  The length of the Data field. 16-bit variable length integer encoded length
+  and terminator bit value. The varint encoding is the same as with 64-bit
+  values, but only 16-bit unencoded values are allowed.
+
+  The maximum voice frame size is 8191 (``0x1FFF``) bytes requiring the 13 least
+  significant bits of the header. The 14th bit (mask: ``0x2000``) is the terminator
+  bit which signals whether the packet is the last one in the voice
+  transmission.
+
+  Note: In CELT the "continuation bit" in the header defines whether there are
+  more audio frames in the current packet. Opus always contains only one frame
+  in the packet. In CELT the voice transmission end is signaled with a
+  zero-length CELT frame while in Opus we have a dedicated terminator bit in
+  the header.
 
 Data
-----
-
-The voice data is transmitted in variable length packets that consist of header portion,
-followed by repeated data segments and an optional position part. The full packet
-structure is shown in the figure below, and consists of three parts. The decrypted
-data should never be longer than 1020 bytes, this allows the use of 1024 byte UDP
-buffer even after the 4-byte encryption header is added to the packet during the
-encryption. The protocol transfers 64-bit integers using variable length encoding.
-This encoding is specified in the *varint* section.
-
-   A voice packet starts with a header:
-
-   +------------------------------------------------------------------------------+
-   | Voice packet header                                                          |
-   +----------------------+---------------+---------------------------------------+
-   | Type                 | Field         | Description                           |
-   +======================+===============+=======================================+
-   | byte                 | Type/Flags    | Bitfield  **76543210**   ,            |
-   |                      |               | 7-5 Type(*), 4-0 Target               |
-   +----------------------+---------------+---------------------------------------+
-   | varint               | Session       | The session number of the source user |
-   |                      |               | (only from server)                    |
-   +----------------------+---------------+---------------------------------------+
-   | varint               | Sequence      |                                       |
-   +----------------------+---------------+---------------------------------------+
-
-   Followed by one or more audio data segments:
-
-   +--------------------------------------------------------------------------------+
-   | Voice packet audio data                                                        |
-   +----------------------+---------------+-----------------------------------------+
-   | Type                 | Field         | Description                             |
-   +==============+=======+===============+=========================================+
-   | Header       | byte  | Header (CELT) | Bitfield **76543210**,                  |
-   |              |       |               | Bit 7: Terminator, Bit 6-0: Data length |
-   | depends on   +-------+---------------+-----------------------------------------+
-   |              | varint| Header (OPUS) | Bitfield **FEDCBA9876543210**           |
-   | packet type  | int16 |               | Bit D: Terminator, Bit C-0: Data length |
-   +--------------+-------+---------------+-----------------------------------------+
-   | byte[]               | Data          | Encoded voice frames                    |
-   +----------------------+---------------+-----------------------------------------+
-  
-   Followed by an optional set of positional audio coordinates:
-
-   +--------------------------------------------------------------------------------+
-   | Voice packet positional audio data                                             |
-   +----------------------+---------------+-----------------------------------------+
-   | Type                 | Field         | Description                             |
-   +======================+===============+=========================================+
-   | float                | Position 1    |                                         |
-   +----------------------+---------------+-----------------------------------------+
-   | float                | Position 2    |                                         |
-   +----------------------+---------------+-----------------------------------------+
-   | float                | Position 3    |                                         |
-   +----------------------+---------------+-----------------------------------------+
-
-The first byte of the header contains the packet type and additional target specifier.
-The format of this byte is described below. If the voice packet comes from the server,
-the type is followed by a *varint* encoded value that specifies the session this
-voice packet originated from -- this information is added by the server and the client
-omits this field. The last segment in the header is a sequence number for the first
-audio frame of the packet. If there are for example two frames in the packet, the sequence
-field of the next packet should be incremented by two.
-
-The type is stored in the first three bits and specifies the type and encoding of the packet.
-Current types are listed in *UDP Types* table. The remaining 5 bits specify additional
-packet-wide options. For voice packets the values specify the voice target as listed in the
-table below:
-
-.. _udp-types:
-
-.. table:: UDP Types
-
-   +--------------------------+--------------------------------------------+
-   | Type      Bitfield       | Description                                |
-   +==========================+============================================+
-   | 0        [000xxxxx]      | CELT Alpha encoded voice data              |
-   +--------------------------+--------------------------------------------+
-   | 1        [001xxxxx]      | Ping packet                                |
-   +--------------------------+--------------------------------------------+
-   | 2        [010xxxxx]      | Speex encoded voice data                   |
-   +--------------------------+--------------------------------------------+
-   | 3        [011xxxxx]      | CELT Beta encoded voice data               |
-   +--------------------------+--------------------------------------------+
-   | 4        [100xxxxx]      | OPUS encoded voice data                    |
-   +--------------------------+--------------------------------------------+
-   | 5-7                      | Unused                                     |
-   +--------------------------+--------------------------------------------+
-
-.. _udp-targets:
-
-.. table:: UDP targets
-
-   +-----------+--------------------------------------------+
-   | Target    | Description                                |
-   +===========+============================================+
-   | 0         | Normal talking                             |
-   +-----------+--------------------------------------------+
-   | 1         | Whisper to channel                         |
-   +-----------+--------------------------------------------+
-   | 2-30      | Direct whisper (always 2 for incoming      |
-   |           | whisper)                                   |
-   +-----------+--------------------------------------------+
-   | 31        | Server loopback                            |
-   +-----------+--------------------------------------------+
-
-The audio frames consist of one byte long header and up to 127 bytes long data portion.
-The first bit in the header is the *terminator bit* which informs the receiver
-whether there are more audio frames after this one. This bit is turned on (value *1*)
-for all but the last frame in the current UDP packet. Rest of the seven bits in the header
-specify the length of the data portion. The data portion is encoded using one of the
-supported codecs. The exact codec is specified in the type portion of the whole packet
-(See the UDP types table). *The data in each frame is encoded separately.*
+  The encoded Opus data.
 
 Codecs
 ------
 
-Mumble supports two distinct codecs; Low bit rate audio uses Speex and higher quality
-audio is encoded with CELT. Both of these codecs must be supported for full support
-of the Mumble protocol. Furthermore, as the CELT bitstream has not been frozen yet
-which places requirements for the exact CELT version: The clients must support
-CELT 0.7.1 bitstream. The protocol includes codec negotiation which allows clients
-to support other codec versions as well, in which case the server should attempt
-to negotiate a version that all clients support. The clients must respect the
-server resolution.
+Mumble supports three distinct codecs. Older Mumble versions use Speex for low
+bitrate audio and CELT for higher quality audio while new Mumble versions
+prefer Opus for all audio. When multiple clients with different capabilities
+communicate together the server is responsible for resolving the codec to use.
+The clients should respect the server resolution if they are capable.
+
+If the server resolves a codec a client doesn't support, that client is free to
+use any codec it prefers. Usually this means the client will not be able to
+decode incoming audio, but it can still send encoded audio out.
+
+The CELT bitstream was never frozen which makes most CELT versions incompatible
+with each other. The two CELT bitstreams supported by Mumble are: CELT 0.7.0
+(CELT Alpha) and CELT 0.11.0 (CELT Beta). While CELT 0.7.0 should technically
+be supported by most Mumble implementations, some servers might be configured
+to force Opus codec for the users. Mumble has had Opus support since 1.2.4
+(June 2013) so it should be safe to assume most clients in use support this
+now.
 
 Whispering
 ----------
@@ -184,123 +294,100 @@ use whispering. This is achieved by registering a voice target using the
 VoiceTarget message and specifying the target ID as the target in the first
 byte of the UDP packet.
 
-Varint and 64-bit integer encoding
-----------------------------------
-
-The variable length integer encoding is used to encode long, 64-bit,
-integers so that short values do not need the full 8 bytes to be transferred.
-The basic idea behind the encoding is prefixing the value with a length prefix
-and then removing the leading zeroes from the value. The positive numbers are
-always right justified. That is to say that the least significant bit in the
-encoded presentation matches the least significant bit in the decoded presentation.
-The *varint prefixes* table contains the definitions of the different length
-prefixes. The encoded **x** bits are part of the decoded number while the **_**
-signifies a unused bit. Encoding should be done by searching the first decoded
-description that fits the number that should be decoded, truncating it to the
-required bytes and combining it with the defined encoding prefix. 
-
-See the *quint64* shift operators in
-https://github.com/mumble-voip/mumble/blob/master/src/PacketDataStream.h
-for a reference implementation.
+UDP connectivity checks
+-----------------------
+
+Since UDP is a connectionless protocol, it is heavily affected by network
+topology such as NAT configuration. It should not be used for audio
+transmission before the connectivity has been determined.
+
+The client starts the connectivity checks by sending a `Ping packet`_ to the
+server. When the server receives this packet it will respond by echoing it back
+to the address it received it from. Once the client receives the response from
+the server it can start using the UDP transport for audio data. When the server
+receives incoming audio data over the UDP transport it can switch the outgoing
+audio over to UDP transport as well.
+
+If the client stops receiving replies to the UDP pings at some point, it should
+start tunneling the voice communication through the TCP tunnel as described in
+the `Tunneling audio over TCP`_ section below. When the server receives a
+tunneled packet over the TCP connection it must also stop using UDP for
+communication. The client should still continue sending audio ping packets over
+the UDP transport in case the UDP connection is restored and the communication
+can be switched back to it.
+
+Tunneling audio over TCP
+------------------------
 
-.. table:: Varint prefixes
+If the UDP channel isn't available the voice packets can be transmitted through
+the TCP transport used for the control channel. These messages use the normal
+TCP prefixing, as shown in figure :ref:`mumble-packet`: 16-bit message type
+followed by 32-bit message length. However, unlike other TCP messages, the audio
+packets are not encoded as protocol buffer messages but instead the raw audio
+packet described in `Packet format`_ should be written to the TCP socket
+verbatim.
 
-   +-----------------------------------+--------------------------------------------------------+
-   | Encoded                           | Decoded                                                |
-   +===================================+========================================================+
-   | **0xxxxxxx**                      | 1 byte with :math:`7 \cdot 8 + 1` leading zeroes       |
-   +-----------------------------------+--------------------------------------------------------+
-   | **10xxxxxx** + 1 byte             | 2 bytes with :math:`6 \cdot 8 + 2` leading zeroes      |
-   +-----------------------------------+--------------------------------------------------------+
-   | **110xxxxx** + 2 bytes            | 3 bytes with :math:`5 \cdot 8 + 3` leading zeroes      |
-   +-----------------------------------+--------------------------------------------------------+
-   | **1110xxxx** + 3 bytes            | 4 bytes with :math:`4 \cdot 8 + 4` leading zeroes      |
-   +-----------------------------------+--------------------------------------------------------+
-   | **111100__** + **int** (4 bytes)  | 32-bit positive number                                 |
-   +-----------------------------------+--------------------------------------------------------+
-   | **111101__** + **long** (8 bytes) | 64-bit number                                          |
-   +-----------------------------------+--------------------------------------------------------+
-   | **111110__** + **varint**         | Negative varint                                        |
-   +-----------------------------------+--------------------------------------------------------+
-   | **111111xx**                      | Byte-inverted negative two byte number (~xx)           |
-   +-----------------------------------+--------------------------------------------------------+
-
-The variable length integer encoding is used to encode long (64-bit) integers so that
-short values do not need the full 8 bytes to be transferred. The encoding function is
-given below. While it might seem complex it is worth noting that the
-:math:`(a_v, a_p) \append (b_v, b_p)` function equals appending the :math:`a_p` bits
-long value :math:`a_v` to a byte stream that already has the :math:`b_p` bits long
-value :math:`b_v`.
-
-.. % Encoding function
-.. % \begin{align*}
-.. % 	(a_v, a_p) \append (b_v, b_p) &= (2^{b_p} a_v + b_v, a_p + b_p) \\
-.. % %
-.. % 	e &: \mathbb{N} \rightarrow \mathbb{N}_{\geq0}^2 \\
-.. % 	e(x) &= \begin{dcases*}
-.. % 			e_+(x, 1)										& when $ 0 \leq x < 2^{28} $ \\
-.. % 			\left((2^8 - 2^4) \cdot {2^8}^4 + x, 2^{40}\right)			& when $ 2^{28} \leq x < 2^{32} $ \\
-.. % 			\left((2^8 - 2^4 + 2^2) \cdot {2^8}^8 + x, 2^{72}\right)	& when $ 2^{32} \leq x $ \\
-.. % 			(2^8 - 2^2 - x, 8)								& when $ -4 < x < 0 $ \\
-.. % 			(2^8 - 2^3, 8) \append e(-x)					& when $ x \leq -4 $ \\
-.. % 		\end{dcases*} \\
-.. % %
-.. % 	e_+(x, b) &= \begin{dcases*}
-.. % 			(p(b) + x, 8)												& when $ r < 2^(8-b) $ \\
-.. % 			e_+\left(\left\lfloor \frac{x}{2^8} \right\rfloor, b + 1\right) \append (x \bmod 2^8, 8)	& when $ r \geq 2^(8-b) $
-.. % 		\end{dcases*} \\
-.. % %
-.. % 	p(b) &= 2^8 - 2^{9-b}
-.. % \end{align*}
-.. 
-.. % Decoding is performed by analyzing the first byte after which the rest of the number can be read from the byte stream.
-.. 
-.. % Decoding function
-.. % \begin{align*}
-.. % 	s_0(x) &= 8 - \left\lfloor log_2(2^8-1 - x) \right\rfloor \\
-.. % %
-.. % 	f_x &: \mathbb{N}_{\geq0} \rightarrow [0, 2^8) \\
-.. % 	d &: f \rightarrow \mathbb{N}, f = \{ f_1, f_2, f_3, ... \} \\
-.. % 	d(f) &= \begin{dcases*}
-.. % 			d_+\Big(f, s_0\big(f(0)\big)\Big)													& when $f(0) \leq 2^8 - 2^4 $ \\
-.. % 			\sum_{i=0}^4 2^{32-8i}f(i)								& when $f(0) = 2^8 - 2^4 $ \\
-.. % 			\sum_{i=0}^8 2^{64-8i}f(i)								& when $f(0) = 2^8 - 2^4 + 2^2 $ \\
-.. % 			-d(g : g(n) = f(n+1))									& when $f(0) = 2^8 - 2^3 $ \\
-.. % 			(2^8 - 2^2) - f(0)										& when $f(0) \geq 2^8 - 2^2 $ \\
-.. % 		\end{dcases*} \\
-.. % %
-.. % 	d_+(f, z) &= -2^{8z - 7z} + \sum_{i=1}^z 2^{8z-8i}f(i-1)
-.. % \end{align*}
-
-.. _tcp-tunnel:
-
-TCP tunnel
-----------
+When the packets are received it is safe to parse the type and length fields
+normally.  If the type matches that of the audio tunnel the rest of the message
+should be processed as a UDP packet without attempting a protocol buffer
+decoding.
 
-If the UDP channel isn't available the voice packets must be transmitted
-through the TCP socket. These messages use the normal TCP prefixing, as seen in shown in
-figure :ref:`mumble-packet`: 16-bit message type followed by 32-bit message
-length. However unlike other TCP messages, the UDP packets are not encoded as
-protocol buffer messages but instead the raw UDP packet described in section :ref:`udp-data`
-should be written to the TCP socket directly.
+Implementation note
+~~~~~~~~~~~~~~~~~~~
 
-When the packets are received it is safe to parse the type and length fields normally.
-If the type matches that of the UDP tunnel the rest of the message should be processed
-as an UDP packet without attempting a protocol buffer decoding.
+When implementing the protocol it is easier to ignore the UDP transfer layer at
+first and just tunnel the UDP data through the TCP tunnel. The TCP layer must
+be implemented for authentication in any case. Making sure that the voice
+transmission works before implementing the UDP protocol simplifies debugging
+greatly.
 
 Encryption
 ----------
 
-All the packets are encrypted once during transfer. The actual encryption depends on the
-used transport layer. If the packets are tunneled through TCP they are encrypted using the
-TLS that encrypts the whole TCP connection and if they are sent directly using UDP they must
-be encrypted using the OCB-AES128 encryption.
+All the packets are encrypted once during transfer. The actual encryption
+depends on the transport layer used. If the packets are tunneled through TCP
+they are encrypted by the TLS layer that protects the whole control channel
+connection, and if they are sent directly using UDP they must be encrypted
+using the OCB-AES128 encryption.
+
+Variable length integer encoding
+--------------------------------
+
+The variable length integer encoding (``varint``) is used to encode long
+(64-bit) integers so that short values do not need the full 8 bytes to be
+transferred. The basic idea behind the encoding is prefixing the value with a
+length prefix and then removing the leading zeroes from the value. The positive
+numbers are always right justified. That is to say that the least significant
+bit in the encoded representation matches the least significant bit in the
+decoded representation. The *varint prefixes* table contains the definitions of
+the different length prefixes. The encoded ``x`` bits are part of the decoded
+number while the ``_`` signifies an unused bit. Encoding should be done by
+searching for the first decoded description that fits the number that should
+be encoded, truncating it to the required bytes and combining it with the
+defined encoding prefix.
 
-Implementation notes
---------------------
+See the *quint64* shift operators in
+https://github.com/mumble-voip/mumble/blob/master/src/PacketDataStream.h
+for a reference implementation.
 
-When implementing the protocol it is easier to ignore the UDP transfer layer at
-first and just tunnel the UDP data through the TCP tunnel. The TCP layer must be implemented
-for authentication in any case. Making sure that the voice transmission works before
-implementing the UDP protocol simplifies debugging greatly. The UDP protocol is a required
-part of the specification though.
+.. table:: Varint prefixes
+
+   +----------------------------------+--------------------------------------------------------+
+   | Encoded                          | Decoded                                                |
+   +==================================+========================================================+
+   | ``0xxxxxxx``                     | 7-bit positive number                                  |
+   +----------------------------------+--------------------------------------------------------+
+   | ``10xxxxxx`` + 1 byte            | 14-bit positive number                                 |
+   +----------------------------------+--------------------------------------------------------+
+   | ``110xxxxx`` + 2 bytes           | 21-bit positive number                                 |
+   +----------------------------------+--------------------------------------------------------+
+   | ``1110xxxx`` + 3 bytes           | 28-bit positive number                                 |
+   +----------------------------------+--------------------------------------------------------+
+   | ``111100__`` + ``int`` (32-bit)  | 32-bit positive number                                 |
+   +----------------------------------+--------------------------------------------------------+
+   | ``111101__`` + ``long`` (64-bit) | 64-bit number                                          |
+   +----------------------------------+--------------------------------------------------------+
+   | ``111110__`` + ``varint``        | Negative recursive varint                              |
+   +----------------------------------+--------------------------------------------------------+
+   | ``111111xx``                     | Byte-inverted negative two bit number (``~xx``)        |
+   +----------------------------------+--------------------------------------------------------+