Add captureTimestamp and senderCaptureTimeOffset to frame metadata

Fixes #225
w3c · Jun 20, 2024 · 1613039 · 1613039
1 parent 4b61373
commit 1613039
Showing 1 changed file with 79 additions and 0 deletions.
diff --git a/index.bs b/index.bs
@@ -48,6 +48,11 @@ spec:webidl; type:dfn; text:resolve
    "CloneArrayBuffer": {
     "href": "https://tc39.es/ecma262/#sec-clonearraybuffer",
     "title": "CloneArrayBuffer"
+   },
+   "RTP-EXT-CAPTURE-TIME": {
+    "href": "https://webrtc.googlesource.com/src/+/refs/heads/main/docs/native-code/rtp-hdrext/abs-capture-time",
+    "title": "RTP Header Extension for Absolute Capture Time",
+    "publisher": "WebRTC Project"
    }
 }
 </pre>
@@ -134,6 +139,20 @@ The <dfn abstract-op>readEncodedData</dfn> algorithm is given a |rtcObject| as p
 1. Let |frame| be the newly produced frame.
 1. Set |frame|.`[[owner]]` to |rtcObject|.
 1. Set |frame|.`[[counter]]` to |rtcObject|.`[[lastEnqueuedFrameCounter]]`.
+1. If the frame has been produced by a {{RTCRtpReceiver}}:
+    1. If the relevant RTP packet contains the
+        [[RTP-EXT-CAPTURE-TIME|RTP Header Extension for Absolute Capture Time]], set |frame|.`[[captureTimestamp]]` to the
+        [[RTP-EXT-CAPTURE-TIME#absolute-capture-timestamp|absolute capture timestamp]] field and set |frame|.`[[senderCaptureTimeOffset]]`
+        to the [[RTP-EXT-CAPTURE-TIME#estimated-capture-clock-offset|estimated capture clock offset]] field if that field is present.
+    1. Otherwise, if the relevant RTP packet does not contain the
+        [[RTP-EXT-CAPTURE-TIME|RTP Header Extension for Absolute Capture Time]] but a previous RTP packet did,
+        set |frame|.`[[captureTimestamp]]` to the result of calculating the absolute capture timestamp according to
+        [[RTP-EXT-CAPTURE-TIME#timestamp-interpolation|timestamp interpolation]] and set |frame|.`[[senderCaptureTimeOffset]]`
+        to the most recent value that was present.
+    1. Otherwise, set |frame|.`[[captureTimestamp]]` to undefined and set |frame|.`[[senderCaptureTimeOffset]]` to undefined.
+1. If the frame has been produced by a {{RTCRtpSender}}, set |frame|.`[[captureTimestamp]]` to the capture timestamp
+    using the methodology described in [[RTP-EXT-CAPTURE-TIME#absolute-capture-timestamp]] and set |frame|.`[[senderCaptureTimeOffset]]`
+    to undefined.
 1. [=ReadableStream/Enqueue=] |frame| in |rtcObject|.`[[readable]]`.
 
 The <dfn abstract-op>writeEncodedData</dfn> algorithm is given a |rtcObject| as parameter and a |frame| as input. It is defined by running the following steps:
@@ -293,6 +312,10 @@ The <dfn method for="SFrameTransform">setEncryptionKey(|key|, |keyID|)</dfn> met
 
 # RTCRtpScriptTransform # {#scriptTransform}
 
+In this section, the capture system refers to the system from which media is sourced, and the sender system
+refers to the system that is sending RTP and RTCP packets to the receiver system, where {{RTCEncodedVideoFrameMetadata}} data
+or {{RTCEncodedAudioFrameMetadata}} data is populated.
+
 ## <dfn enum>RTCEncodedVideoFrameType</dfn> dictionary ## {#RTCEncodedVideoFrameType}
 <pre class="idl">
 // New enum for video frame types. Will eventually re-use the equivalent defined
@@ -358,6 +381,8 @@ dictionary RTCEncodedVideoFrameMetadata {
     sequence&lt;unsigned long&gt; contributingSources;
     long long timestamp;    // microseconds
     unsigned long rtpTimestamp;
+    DOMHighResTimeStamp captureTimestamp;
+    DOMHighResTimeStamp senderCaptureTimeOffset;
     DOMString mimeType;
 };
 </pre>
@@ -431,6 +456,32 @@ dictionary RTCEncodedVideoFrameMetadata {
             that reflects the sampling instant of the first octet in the RTP data packet.
         </p>
     </dd>
+    <dt>
+        <dfn dict-member>captureTimestamp</dfn> <span class="idlMemberType">DOMHighResTimeStamp</span>
+    </dt>
+    <dd>
+        <p>
+            The {{RTCEncodedVideoFrameMetadata/captureTimestamp}} is set by the frame source, and for frames that come
+            from the {{RTCRtpReceiver}}, it is extracted by the [[#stream-processing]] algorithm. Its reference clock
+            is the capture system's NTP clock (same clock used to generate NTP timestamps for RTCP sender reports on
+            that system).
+
+            On populating this member, the user agent MUST return the value of the frame's `[[captureTimestamp]]` slot.
+        </p>
+    </dd>
+    <dt>
+        <dfn dict-member>senderCaptureTimeOffset</dfn> <span class="idlMemberType">DOMHighResTimeStamp</span>
+    </dt>
+    <dd>
+        <p>
+            The {{RTCEncodedVideoFrameMetadata/senderCaptureTimeOffset}} is the sender system's estimate of the offset
+            between its own NTP clock and the capture system's NTP clock, for the same frame from which the
+            {{RTCEncodedVideoFrameMetadata/captureTimestamp}} originated. It is extracted by the
+            [[#stream-processing]] algorithm.
+
+            On populating this member, the user agent MUST return the value of the frame's `[[senderCaptureTimeOffset]]` slot.
+        </p>
+    </dd>
     <dt>
         <dfn dict-member>mimeType</dfn> <span class="idlMemberType">DOMString</span>
     </dt>
@@ -611,6 +662,8 @@ dictionary RTCEncodedAudioFrameMetadata {
     sequence&lt;unsigned long&gt; contributingSources;
     short sequenceNumber;
     unsigned long rtpTimestamp;
+    DOMHighResTimeStamp captureTimestamp;
+    DOMHighResTimeStamp senderCaptureTimeOffset;
     DOMString mimeType;
 };
 </pre>
@@ -664,6 +717,32 @@ dictionary RTCEncodedAudioFrameMetadata {
             that reflects the sampling instant of the first octet in the RTP data packet.
         </p>
     </dd>
+    <dt>
+        <dfn dict-member>captureTimestamp</dfn> <span class="idlMemberType">DOMHighResTimeStamp</span>
+    </dt>
+    <dd>
+        <p>
+            The {{RTCEncodedAudioFrameMetadata/captureTimestamp}} is set by the frame source, and for frames that come
+            from the {{RTCRtpReceiver}}, it is extracted by the [[#stream-processing]] algorithm. Its reference clock
+            is the capture system's NTP clock (same clock used to generate NTP timestamps for RTCP sender reports on
+            that system).
+
+            On populating this member, the user agent MUST return the value of the frame's `[[captureTimestamp]]` slot.
+        </p>
+    </dd>
+    <dt>
+        <dfn dict-member>senderCaptureTimeOffset</dfn> <span class="idlMemberType">DOMHighResTimeStamp</span>
+    </dt>
+    <dd>
+        <p>
+            The {{RTCEncodedAudioFrameMetadata/senderCaptureTimeOffset}} is the sender system's estimate of the offset
+            between its own NTP clock and the capture system's NTP clock, for the same frame from which the
+            {{RTCEncodedAudioFrameMetadata/captureTimestamp}} originated. It is extracted by the
+            [[#stream-processing]] algorithm.
+
+            On populating this member, the user agent MUST return the value of the frame's `[[senderCaptureTimeOffset]]` slot.
+        </p>
+    </dd>
     <dt>
         <dfn dict-member>mimeType</dfn> <span class="idlMemberType">DOMString</span>
     </dt>