Face detection: add specification
Add specifications for human face metadata and related constraints,
capabilities, and settings. Add also corresponding examples.
ttoivone committed Nov 22, 2022
1 parent ec904d3 commit 5f8b11b
Showing 1 changed file with 380 additions and 1 deletion: index.html
@@ -9,7 +9,7 @@
// See https://github.com/w3c/respec/wiki/ for how to configure ReSpec
var respecConfig = {
group: "webrtc",
xref: ["html", "infra", "permissions", "dom", "mediacapture-streams", "webaudio", "webidl"],
xref: ["geometry-1", "html", "infra", "permissions", "dom", "image-capture", "mediacapture-streams", "webaudio", "webcodecs", "webidl"],
edDraftURI: "https://w3c.github.io/mediacapture-extensions/",
editors: [
{name: "Jan-Ivar Bruaroey", company: "Mozilla Corporation", w3cid: 79152},
@@ -654,5 +654,384 @@ <h2>Exposing change of MediaStreamTrack configuration</h2>
</p>
</div>
</section>
<section>
<h2>Human faces</h2>
<p>Human face metadata describes the human faces in video frames. It can
be set by web applications using the standard means when creating
{{VideoFrameMetadata}} for {{VideoFrame}}s or it can be set by a user agent
when the media track constraints, defined below, are used to
enable face detection for the {{MediaStreamTrack}} which provides the
{{VideoFrame}}s.</p>
<p>The facial metadata may be used by video encoders to enhance the quality
of the faces in encoded video streams or for other suitable purposes.</p>
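<p>For instance, a web application that runs its own face detector could attach
the resulting metadata when constructing a new {{VideoFrame}}. The following is
a minimal, non-normative sketch; <code>frame</code> is assumed to be an existing
{{VideoFrame}} and the coordinate values are purely illustrative.</p>
<pre class="example">
// Sketch: attach application-computed face metadata to a new VideoFrame
// constructed from an existing frame (coordinates are illustrative).
const face = {
  id: 1,
  probability: 0.95,
  boundingBox: DOMRectReadOnly.fromRect({x: 0.40, y: 0.30, width: 0.20, height: 0.25}),
  leftEye: {centerPoint: {x: 0.45, y: 0.40}},
  rightEye: {centerPoint: {x: 0.55, y: 0.40}},
  mouth: {centerPoint: {x: 0.50, y: 0.48}}
};
const annotatedFrame = new VideoFrame(frame, {
  timestamp: frame.timestamp,
  metadata: {...frame.metadata(), humanFaces: [face]}
});
frame.close();
</pre>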
<section>
<h3>{{VideoFrameMetadata}}</h3>
<pre class="idl">
partial dictionary VideoFrameMetadata {
sequence&lt;HumanFace&gt; humanFaces;
};</pre>
<section class="notoc">
<h4>Members</h4>
<dl class="dictionary-members" data-link-for="VideoFrameMetadata" data-dfn-for="VideoFrameMetadata">
<dt><dfn><code>humanFaces</code></dfn> of type <span class="idlMemberType"><code>sequence&lt;{{HumanFace}}&gt;</code></span></dt>
<dd>
<p>The set of known human faces in this video frame.</p>
</dd>
</dl>
</section>
</section>
<section>
<h3>{{HumanFace}}</h3>
<pre class="idl">
dictionary HumanFace {
long id;
float probability;
DOMRectReadOnly boundingBox;
HumanFaceLandmark leftEye;
HumanFaceLandmark rightEye;
HumanFaceLandmark mouth;
};</pre>
<section class="notoc">
<h4>Dictionary {{HumanFace}} Members</h4>
<dl class="dictionary-members" data-dfn-for="HumanFace" data-link-for="HumanFace">
<dt><dfn><code>id</code></dfn> of type <span class="idlMemberType">{{long}}</span></dt>
<dd>
<p>If set, a unique identifier of a face within a sequence. If the same face can be tracked in multiple
frames originating from the same {{MediaStreamTrack}} source, {{id}} is set to the same integer value
for the face in all frames.</p>
<p>The user agent MUST NOT assign the value of {{id}} in such a way that the detected faces could
be correlated in any way across different {{MediaStreamTrack}} objects.</p>
</dd>
<dt><dfn><code>probability</code></dfn> of type <span class="idlMemberType">{{float}}</span></dt>
<dd>
<p>If set, the approximate probability, in the range (0,1], that this {{HumanFace}}
describes an actual human face.</p>
</dd>
<dt><dfn><code>boundingBox</code></dfn> of type <span class="idlMemberType"><code>{{DOMRectReadOnly}}</code></span></dt>
<dd>
<p>A bounding box surrounding the face. The corner coordinates of the
bounding box are interpreted as coordinates in a normalized square
space: the origin {x,y} = {0.0, 0.0} represents the upper left corner
and {x,y} = {1.0, 1.0} represents the lower right corner of the
rendered frame.</p>
<p>The face location in the frame may be specified even if the face is
obscured by other objects in front of it or if it lies partially or
fully outside of the frame.</p>
</dd>
<dt><dfn><code>leftEye</code></dfn> of type <span class="idlMemberType"><code>{{HumanFaceLandmark}}</code></span></dt>
<dd>
<p>If present, describes the properties of the left eye of the face.</p>
</dd>
<dt><dfn><code>rightEye</code></dfn> of type <span class="idlMemberType"><code>{{HumanFaceLandmark}}</code></span></dt>
<dd>
<p>If present, describes the properties of the right eye of the face.</p>
</dd>
<dt><dfn><code>mouth</code></dfn> of type <span class="idlMemberType"><code>{{HumanFaceLandmark}}</code></span></dt>
<dd>
<p>If present, describes the properties of the mouth of the face.</p>
</dd>
</dl>
</section>
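<p>Since {{HumanFace/boundingBox}} is expressed in normalized coordinates,
applications typically scale it to pixel coordinates before drawing or cropping.
The helper below is a non-normative sketch; <code>boundingBoxToPixels</code> is a
hypothetical name and <code>frame</code> is assumed to be a {{VideoFrame}}.</p>
<pre class="example">
// Sketch: scale a normalized HumanFace bounding box to the pixel
// coordinates of a given frame.
function boundingBoxToPixels(boundingBox, frame) {
  return DOMRectReadOnly.fromRect({
    x: boundingBox.x * frame.displayWidth,
    y: boundingBox.y * frame.displayHeight,
    width: boundingBox.width * frame.displayWidth,
    height: boundingBox.height * frame.displayHeight
  });
}
</pre>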
</section>
<section>
<h3>{{HumanFaceLandmark}}</h3>
<pre class="idl">dictionary HumanFaceLandmark {
Point2D centerPoint;
};</pre>
<section class="notoc">
<h4>Dictionary {{HumanFaceLandmark}} Members</h4>
<dl class="dictionary-members" data-dfn-for="HumanFaceLandmark" data-link-for="HumanFaceLandmark">
<dt><dfn><code>centerPoint</code></dfn> of type <span class="idlMemberType">{{Point2D}}</span></dt>
<dd><p>The coordinates of the approximate center of the detected landmark.
The landmark location in the frame may be specified even if it is
obscured by other objects in front of it or if it lies partially or
fully outside of the frame. See the member {{HumanFace/boundingBox}} for the definition
of the coordinate system.</p>
</dd>
</dl>
</section>
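<p>Landmark coordinates use the same normalized space. As a non-normative sketch,
the hypothetical helper below estimates the distance between the eyes in pixels,
for example to decide whether a face is large enough for further processing;
<code>face</code> is assumed to be a {{HumanFace}} and <code>frame</code> a {{VideoFrame}}.</p>
<pre class="example">
// Sketch: estimate the interocular distance in pixels.
function eyeDistanceInPixels(face, frame) {
  if (!face.leftEye || !face.rightEye)
    return null;
  const dx = (face.rightEye.centerPoint.x - face.leftEye.centerPoint.x) * frame.displayWidth;
  const dy = (face.rightEye.centerPoint.y - face.leftEye.centerPoint.y) * frame.displayHeight;
  return Math.hypot(dx, dy);
}
</pre>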
</section>
<section>
<h3>{{MediaTrackSupportedConstraints}}</h3>
<pre class="idl">
partial dictionary MediaTrackSupportedConstraints {
boolean humanFaceDetectionMode = true;
boolean humanFaceLandmarkDetectionMode = true;
};</pre>
<section class="notoc">
<h4>Dictionary {{MediaTrackSupportedConstraints}} Members</h4>
<dl class="dictionary-members" data-dfn-for="MediaTrackSupportedConstraints" data-link-for="MediaTrackSupportedConstraints">
<dt><dfn><code>humanFaceDetectionMode</code></dfn> of type <span class="idlMemberType">{{boolean}}</span>, defaulting to <code>true</code></dt>
<dd>
<p>Whether <a>face detection mode</a> constraining is recognized.</p>
</dd>
<dt><dfn><code>humanFaceLandmarkDetectionMode</code></dfn> of type <span class="idlMemberType">{{boolean}}</span>, defaulting to <code>true</code></dt>
<dd>
<p>Whether <a>face landmark detection mode</a> constraining is recognized.</p>
</dd>
</dl>
</section>
</section>
<section>
<h3>{{MediaTrackCapabilities}}</h3>
<pre class="idl">
partial dictionary MediaTrackCapabilities {
sequence&lt;DOMString&gt; humanFaceDetectionMode;
sequence&lt;DOMString&gt; humanFaceLandmarkDetectionMode;
};</pre>
<section class="notoc">
<h4>Dictionary {{MediaTrackCapabilities}} Members</h4>
<dl class="dictionary-members" data-dfn-for="MediaTrackCapabilities" data-link-for="MediaTrackCapabilities">
<dt><dfn><code>humanFaceDetectionMode</code></dfn> of type <span class="idlMemberType">sequence&lt;{{DOMString}}&gt;</span></dt>
<dd>
<p>The sequence of supported <a>face detection modes</a>.
Each string MUST be one of the members of {{ObjectDetectionMode}}. The mode {{ObjectDetectionMode/"center-point"}} MUST NOT be supported.</p>
</dd>
<dt><dfn><code>humanFaceLandmarkDetectionMode</code></dfn> of type <span class="idlMemberType">sequence&lt;{{DOMString}}&gt;</span></dt>
<dd>
<p>The sequence of supported <a>face landmark detection modes</a>.
Each string MUST be one of the members of {{ObjectDetectionMode}}. The mode {{ObjectDetectionMode/"bounding-box"}} MUST NOT be supported.</p>
</dd>
</dl>
</section>
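<p>A non-normative sketch of choosing a face detection mode based on the
capabilities reported by a track; <code>track</code> is assumed to be a video
{{MediaStreamTrack}}.</p>
<pre class="example">
// Sketch: pick the best supported face detection mode for a track.
const capabilities = track.getCapabilities();
const faceModes = capabilities.humanFaceDetectionMode || [];
const faceDetectionMode = faceModes.includes('bounding-box') ? 'bounding-box' : 'none';
</pre>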
</section>
<section>
<h3>{{MediaTrackConstraintSet}}</h3>
<pre class="idl">
partial dictionary MediaTrackConstraintSet {
ConstrainDOMString humanFaceDetectionMode;
ConstrainDOMString humanFaceLandmarkDetectionMode;
};</pre>
<section class="notoc">
<h4>Dictionary {{MediaTrackConstraintSet}} Members</h4>
<dl class="dictionary-members" data-dfn-for="MediaTrackConstraintSet" data-link-for="MediaTrackConstraintSet">
<dt><dfn><code>humanFaceDetectionMode</code></dfn> of type <span class="idlMemberType">{{ConstrainDOMString}}</span></dt>
<dd>
<p>The string MUST be one of the members of {{ObjectDetectionMode}}.
See <a>face detection mode</a> constrainable property.</p>
</dd>
<dt><dfn><code>humanFaceLandmarkDetectionMode</code></dfn> of type <span class="idlMemberType">{{ConstrainDOMString}}</span></dt>
<dd>
<p>The string MUST be one of the members of {{ObjectDetectionMode}}.
See <a>face landmark detection mode</a> constrainable property.</p>
</dd>
</dl>
</section>
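<p>A non-normative sketch of applying the constraint to an already opened track;
an <code>exact</code> constraint fails with {{OverconstrainedError}} if the
requested mode is not supported. <code>track</code> is assumed to be a video
{{MediaStreamTrack}}.</p>
<pre class="example">
// Sketch: request bounding-box face detection on an existing track.
try {
  await track.applyConstraints({humanFaceDetectionMode: {exact: 'bounding-box'}});
} catch (e) {
  // The source cannot provide bounding-box face detection.
}
</pre>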
</section>
<section>
<h3>{{MediaTrackSettings}}</h3>
<pre class="idl">
partial dictionary MediaTrackSettings {
DOMString humanFaceDetectionMode;
DOMString humanFaceLandmarkDetectionMode;
};</pre>
<section class="notoc">
<h4>Dictionary {{MediaTrackSettings}} Members</h4>
<dl class="dictionary-members" data-dfn-for="MediaTrackSettings" data-link-for="MediaTrackSettings">
<dt><dfn><code>humanFaceDetectionMode</code></dfn> of type <span class="idlMemberType">{{DOMString}}</span></dt>
<dd>
<p>Current <a>face detection mode</a> setting.
The string MUST be one of the members of {{ObjectDetectionMode}} excluding {{ObjectDetectionMode/"center-point"}}.</p>
</dd>
<dt><dfn><code>humanFaceLandmarkDetectionMode</code></dfn> of type <span class="idlMemberType">{{DOMString}}</span></dt>
<dd>
<p>Current <a>face landmark detection mode</a> setting.
The string MUST be one of the members of {{ObjectDetectionMode}} excluding {{ObjectDetectionMode/"bounding-box"}}.</p>
</dd>
</dl>
</section>
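<p>A non-normative sketch of reading back the currently applied modes;
<code>track</code> is assumed to be a video {{MediaStreamTrack}}.</p>
<pre class="example">
// Sketch: inspect the current face and landmark detection settings.
const settings = track.getSettings();
if (settings.humanFaceDetectionMode === 'bounding-box') {
  // Face bounding boxes will appear in the metadata of delivered frames.
}
if (settings.humanFaceLandmarkDetectionMode === 'center-point') {
  // Landmark center points will appear in the metadata of delivered frames.
}
</pre>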
</section>
<section>
<h3>{{ObjectDetectionMode}}</h3>
<pre class="idl">
enum ObjectDetectionMode {
"none",
"center-point",
"bounding-box",
};</pre>
<section class="notoc">
<h4>{{ObjectDetectionMode}} Enumeration Description</h4>
<dl data-dfn-for="ObjectDetectionMode" data-link-for="ObjectDetectionMode">
<dt><dfn><code>none</code></dfn></dt>
<dd>
<p>This source does not offer object detection for the corresponding object type.
When used as a setting, this is interpreted as a command to stop producing
detection metadata for the corresponding object type.</p>
</dd>
<dt><dfn><code>center-point</code></dfn></dt>
<dd>
<p>This source offers detection of the center point of the corresponding
object type, either a human face or a face landmark, or such a mode is requested.</p>
</dd>
<dt><dfn><code>bounding-box</code></dfn></dt>
<dd>
<p>This source offers detection of the bounding box of the corresponding
object type, either a human face or a face landmark, or such a mode is requested.</p>
</dd>
</dl>
</section>
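<p>A non-normative sketch of turning detection off again by constraining both
modes to {{ObjectDetectionMode/"none"}}; <code>track</code> is assumed to be a
video {{MediaStreamTrack}}.</p>
<pre class="example">
// Sketch: stop producing face and landmark detection metadata on this track.
await track.applyConstraints({
  humanFaceDetectionMode: 'none',
  humanFaceLandmarkDetectionMode: 'none'
});
</pre>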
</section>
<section>
<h3>Constrainable Properties</h3>
<ol>
<li>
<p><dfn>Face detection mode</dfn> describes which face properties
are to be detected and set in the metadata for the video frame.</p>
</li>
<li>
<p><dfn>Face landmark detection mode</dfn> describes which face landmark
properties are to be detected and set in the metadata for the video frame.</p>
</li>
</ol>
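<p>A non-normative sketch of requesting both constrainable properties when
opening the camera; using <code>ideal</code> values lets the call succeed even
on sources without detection support.</p>
<pre class="example">
// Sketch: request face and landmark detection when opening the camera.
const stream = await navigator.mediaDevices.getUserMedia({
  video: {
    humanFaceDetectionMode: {ideal: 'bounding-box'},
    humanFaceLandmarkDetectionMode: {ideal: 'center-point'}
  }
});
</pre>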
</section>
<section>
<h3>Examples</h3>
<pre class="example">
// main.js:
// Check if face detection is supported by the browser
const supports = navigator.mediaDevices.getSupportedConstraints();
if (supports.humanFaceDetectionMode) {
// Browser supports face detection.
} else {
throw new Error('Face detection is not supported');
}

// Open camera with face detection enabled
const stream = await navigator.mediaDevices.getUserMedia({
video: { humanFaceDetectionMode: 'bounding-box' }
});
const [videoTrack] = stream.getVideoTracks();

// Use a video worker and show to user.
const videoElement = document.querySelector('video');
const videoWorker = new Worker('video-worker.js');
videoWorker.postMessage({track: videoTrack}, [videoTrack]);
const {data} = await new Promise(r => videoWorker.onmessage = r);
videoElement.srcObject = new MediaStream([data.videoTrack]);

// video-worker.js:
self.onmessage = async ({data: {track}}) => {
const generator = new VideoTrackGenerator();
self.postMessage({videoTrack: generator.track}, [generator.track]);
const {readable} = new MediaStreamTrackProcessor({track});
const transformer = new TransformStream({
async transform(frame, controller) {
for (const face of frame.metadata().humanFaces || []) {
if (face.boundingBox) {
console.log(
`Face @ (${face.boundingBox.left}, ${face.boundingBox.top}), ` +
`(${face.boundingBox.right}, ${face.boundingBox.bottom})`);
}
}
controller.enqueue(frame);
}
});
await readable.pipeThrough(transformer).pipeTo(generator.writable);
};
</pre>
<pre class="example">
// main.js:
// Open camera.
const stream = await navigator.mediaDevices.getUserMedia({video: true});
const [videoTrack] = stream.getVideoTracks();

// Use a video worker and show to user.
const videoElement = document.querySelector('video');
const videoWorker = new Worker('video-worker.js');
videoWorker.postMessage({track: videoTrack}, [videoTrack]);
const {data} = await new Promise(r => videoWorker.onmessage = r);
videoElement.srcObject = new MediaStream([data.videoTrack]);

// video-worker.js:
self.onmessage = async ({data: {track}}) => {
// Apply constraints.
let customBackgroundBlur = true;
let customEyeGazeCorrection = true;
let customFaceDetection = false;
let faceDetectionMode;
let landmarkDetectionMode;
const capabilities = track.getCapabilities();
if (capabilities.backgroundBlur &amp;&amp; capabilities.backgroundBlur.max &gt; 0) {
// The platform supports background blurring.
// Let's use platform background blurring and skip the custom one.
await track.applyConstraints({
advanced: [{backgroundBlur: capabilities.backgroundBlur.max}]
});
customBackgroundBlur = false;
} else if ((capabilities.humanFaceDetectionMode || []).includes('bounding-box')) {
// The platform supports face bounding box detection but not background
// blurring. Let's use platform face detection to aid custom
// background blurring.
faceDetectionMode = 'bounding-box';
await track.applyConstraints({
advanced: [{ humanFaceDetectionMode: faceDetectionMode }]
});
} else {
// The platform does not support background blurring or face detection.
// Let's use custom face detection to aid custom background blurring.
customFaceDetection = true;
}
if ((capabilities.eyeGazeCorrection || []).includes(true)) {
// The platform supports eye gaze correction.
// Let's use platform eye gaze correction and skip the custom one.
await track.applyConstraints({
advanced: [{eyeGazeCorrection: true}]
});
customEyeGazeCorrection = false;
} else if ((capabilities.humanFaceLandmarkDetectionMode || []).includes('center-point')) {
// The platform supports face landmark detection but not eye gaze
// correction. Let's use platform face landmark detection to aid custom eye
// gaze correction.
landmarkDetectionMode = 'center-point';
await track.applyConstraints({
advanced: [{ humanFaceLandmarkDetectionMode: landmarkDetectionMode }]
});
} else {
// The platform does not support eye gaze correction or face landmark
// detection. Let's use custom face landmark detection to aid custom eye
// gaze correction.
customFaceDetection = true;
}

// Load custom libraries which may utilize TensorFlow and/or WASM.
const requiredScripts = [].concat(
customBackgroundBlur ? 'background.js' : [],
customEyeGazeCorrection ? 'eye-gaze.js' : [],
customFaceDetection ? 'face.js' : []
);
importScripts(...requiredScripts);

const generator = new VideoTrackGenerator();
self.postMessage({videoTrack: generator.track}, [generator.track]);
const {readable} = new MediaStreamTrackProcessor({track});
const transformer = new TransformStream({
async transform(frame, controller) {
// Detect faces or retrieve detected faces.
const humanFaces =
customFaceDetection
? await detectFaces(frame)
: frame.metadata().humanFaces;
// Blur the background if needed.
if (customBackgroundBlur) {
const newFrame = await blurBackground(frame, humanFaces);
frame.close();
frame = newFrame;
}
// Correct the eye gaze if needed.
if (customEyeGazeCorrection &amp;&amp; (humanFaces || []).length &gt; 0) {
const newFrame = await correctEyeGaze(frame, humanFaces);
frame.close();
frame = newFrame;
}
controller.enqueue(frame);
}
});
await readable.pipeThrough(transformer).pipeTo(generator.writable);
};
</pre>
</section>
</section>
</body>
</html>
