178 changes: 89 additions & 89 deletions examples/README.md

Large diffs are not rendered by default.

53 changes: 52 additions & 1 deletion examples/annotation_import/audio.ipynb
@@ -170,7 +170,7 @@
},
{
"metadata": {},
"source": "ontology_builder = lb.OntologyBuilder(classifications=[\n lb.Classification(class_type=lb.Classification.Type.TEXT,\n name=\"text_audio\"),\n lb.Classification(\n class_type=lb.Classification.Type.CHECKLIST,\n name=\"checklist_audio\",\n options=[\n lb.Option(value=\"first_checklist_answer\"),\n lb.Option(value=\"second_checklist_answer\"),\n ],\n ),\n lb.Classification(\n class_type=lb.Classification.Type.RADIO,\n name=\"radio_audio\",\n options=[\n lb.Option(value=\"first_radio_answer\"),\n lb.Option(value=\"second_radio_answer\"),\n ],\n ),\n])\n\nontology = client.create_ontology(\n \"Ontology Audio Annotations\",\n ontology_builder.asdict(),\n media_type=lb.MediaType.Audio,\n)",
"source": "ontology_builder = lb.OntologyBuilder(classifications=[\n lb.Classification(class_type=lb.Classification.Type.TEXT,\n name=\"text_audio\"),\n lb.Classification(\n class_type=lb.Classification.Type.CHECKLIST,\n name=\"checklist_audio\",\n options=[\n lb.Option(value=\"first_checklist_answer\"),\n lb.Option(value=\"second_checklist_answer\"),\n ],\n ),\n lb.Classification(\n class_type=lb.Classification.Type.RADIO,\n name=\"radio_audio\",\n options=[\n lb.Option(value=\"first_radio_answer\"),\n lb.Option(value=\"second_radio_answer\"),\n ],\n ),\n # Temporal classification for token-level annotations\n lb.Classification(\n class_type=lb.Classification.Type.TEXT,\n name=\"User Speaker\",\n scope=lb.Classification.Scope.INDEX, # INDEX scope for temporal\n ),\n])\n\nontology = client.create_ontology(\n \"Ontology Audio Annotations\",\n ontology_builder.asdict(),\n media_type=lb.MediaType.Audio,\n)",
"cell_type": "code",
"outputs": [],
"execution_count": null
@@ -223,6 +223,27 @@
],
"cell_type": "markdown"
},
{
"metadata": {},
"source": "label = []\nlabel.append(\n lb_types.Label(\n data={\"global_key\": global_key},\n annotations=[text_annotation, checklist_annotation, radio_annotation],\n ))",
@@ -252,6 +273,29 @@
],
"cell_type": "markdown"
},
{
"metadata": {},
"source": [
"## Temporal Audio Annotations\n",
"\n",
"You can create temporal annotations for individual tokens (words) with precise timing:\n"
],
"cell_type": "markdown"
},
{
"metadata": {},
"source": "# Define tokens with precise timing (from demo script)\ntokens_data = [\n (\"Hello\", 586, 770), # Hello: frames 586-770\n (\"AI\", 771, 955), # AI: frames 771-955\n (\"how\", 956, 1140), # how: frames 956-1140\n (\"are\", 1141, 1325), # are: frames 1141-1325\n (\"you\", 1326, 1510), # you: frames 1326-1510\n (\"doing\", 1511, 1695), # doing: frames 1511-1695\n (\"today\", 1696, 1880), # today: frames 1696-1880\n]\n\n# Create temporal annotations for each token\ntemporal_annotations = []\nfor token, start_frame, end_frame in tokens_data:\n token_annotation = lb_types.AudioClassificationAnnotation(\n frame=start_frame,\n end_frame=end_frame,\n name=\"User Speaker\",\n value=lb_types.Text(answer=token),\n )\n temporal_annotations.append(token_annotation)\n\nprint(f\"Created {len(temporal_annotations)} temporal token annotations\")",
"cell_type": "code",
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"source": "# Create label with both regular and temporal annotations\nlabel_with_temporal = []\nlabel_with_temporal.append(\n lb_types.Label(\n data={\"global_key\": global_key},\n annotations=[text_annotation, checklist_annotation, radio_annotation] +\n temporal_annotations,\n ))\n\nprint(\n f\"Created label with {len(label_with_temporal[0].annotations)} total annotations\"\n)\nprint(f\" - Regular annotations: 3\")\nprint(f\" - Temporal annotations: {len(temporal_annotations)}\")",
"cell_type": "code",
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"source": [
@@ -260,6 +304,13 @@
],
"cell_type": "markdown"
},
{
"metadata": {},
"source": "# Upload temporal annotations via MAL\ntemporal_upload_job = lb.MALPredictionImport.create_from_objects(\n client=client,\n project_id=project.uid,\n name=f\"temporal_mal_job-{str(uuid.uuid4())}\",\n predictions=label_with_temporal,\n)\n\ntemporal_upload_job.wait_until_done()\nprint(\"Temporal upload completed!\")\nprint(\"Errors:\", temporal_upload_job.errors)\nprint(\"Status:\", temporal_upload_job.statuses)",
"cell_type": "code",
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"source": "# Upload our label using Model-Assisted Labeling\nupload_job = lb.MALPredictionImport.create_from_objects(\n client=client,\n project_id=project.uid,\n name=f\"mal_job-{str(uuid.uuid4())}\",\n predictions=label,\n)\n\nupload_job.wait_until_done()\nprint(\"Errors:\", upload_job.errors)\nprint(\"Status of uploads: \", upload_job.statuses)",
libs/labelbox/src/labelbox/data/annotation_types/__init__.py
@@ -19,6 +19,9 @@
from .video import MaskInstance
from .video import VideoMaskAnnotation

from .audio import AudioClassificationAnnotation
from .audio import AudioObjectAnnotation

from .ner import ConversationEntity
from .ner import DocumentEntity
from .ner import DocumentTextSelection
63 changes: 63 additions & 0 deletions libs/labelbox/src/labelbox/data/annotation_types/audio.py
@@ -0,0 +1,63 @@
from typing import Optional

from labelbox.data.annotation_types.annotation import (
ClassificationAnnotation,
ObjectAnnotation,
)
from labelbox.data.mixins import (
ConfidenceNotSupportedMixin,
CustomMetricsNotSupportedMixin,
)


class AudioClassificationAnnotation(ClassificationAnnotation):
"""Audio classification for specific time range

Examples:
- Speaker identification from 2500ms to 4100ms
- Audio quality assessment for a segment
- Language detection for audio segments

Args:
name (Optional[str]): Name of the classification
feature_schema_id (Optional[Cuid]): Feature schema identifier
value (Union[Text, Checklist, Radio]): Classification value
frame (int): The frame index in milliseconds (e.g., 2500 = 2.5 seconds)
end_frame (Optional[int]): End frame in milliseconds (for time ranges)
segment_index (Optional[int]): Index of audio segment this annotation belongs to
extra (Dict[str, Any]): Additional metadata
"""

frame: int
end_frame: Optional[int] = None
segment_index: Optional[int] = None


class AudioObjectAnnotation(
ObjectAnnotation,
ConfidenceNotSupportedMixin,
CustomMetricsNotSupportedMixin,
):
"""Audio object annotation for specific time range

Examples:
- Transcription: "Hello world" from 2500ms to 4100ms
- Sound events: "Dog barking" from 10000ms to 12000ms
- Audio segments with metadata

Args:
name (Optional[str]): Name of the annotation
feature_schema_id (Optional[Cuid]): Feature schema identifier
value (Union[TextEntity, Geometry]): Localization or text content
frame (int): The frame index in milliseconds (e.g., 10000 = 10.0 seconds)
end_frame (Optional[int]): End frame in milliseconds (for time ranges)
keyframe (bool): Whether this is a keyframe annotation (default: True)
segment_index (Optional[int]): Index of audio segment this annotation belongs to
classifications (Optional[List[ClassificationAnnotation]]): Optional sub-classifications
extra (Dict[str, Any]): Additional metadata
"""

frame: int
end_frame: Optional[int] = None
keyframe: bool = True
segment_index: Optional[int] = None
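
For reference, a minimal construction sketch (not part of the diff) using the `lb_types` alias from the notebook above; the classification name "User Speaker" and the token timing come from the example cells, and the fields follow the class definitions in this file.

```python
import labelbox.types as lb_types

# "Hello" spoken from 586 ms to 770 ms, matching the token timing in the notebook
hello_token = lb_types.AudioClassificationAnnotation(
    name="User Speaker",                  # classification name from the ontology
    value=lb_types.Text(answer="Hello"),  # the transcribed token
    frame=586,                            # start of the range, in milliseconds
    end_frame=770,                        # optional end of the range, in milliseconds
)
```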
26 changes: 26 additions & 0 deletions libs/labelbox/src/labelbox/data/annotation_types/label.py
@@ -13,6 +13,7 @@
from .metrics import ScalarMetric, ConfusionMatrixMetric
from .video import VideoClassificationAnnotation
from .video import VideoObjectAnnotation, VideoMaskAnnotation
from .audio import AudioClassificationAnnotation, AudioObjectAnnotation
from .mmc import MessageEvaluationTaskAnnotation
from pydantic import BaseModel, field_validator

@@ -44,6 +45,8 @@ class Label(BaseModel):
ClassificationAnnotation,
ObjectAnnotation,
VideoMaskAnnotation,
AudioClassificationAnnotation,
AudioObjectAnnotation,
ScalarMetric,
ConfusionMatrixMetric,
RelationshipAnnotation,
Expand Down Expand Up @@ -85,6 +88,29 @@ def frame_annotations(
frame_dict[annotation.frame].append(annotation)
return frame_dict

def audio_annotations_by_frame(
self,
) -> Dict[
int, List[Union[AudioObjectAnnotation, AudioClassificationAnnotation]]
]:
"""Get audio annotations organized by frame (millisecond)

Returns:
Dict[int, List]: Dictionary mapping frame (milliseconds) to list of audio annotations

Example:
>>> label.audio_annotations_by_frame()
{2500: [AudioClassificationAnnotation(...)], 10000: [AudioObjectAnnotation(...)]}
"""
frame_dict = defaultdict(list)
for annotation in self.annotations:
if isinstance(
annotation,
(AudioObjectAnnotation, AudioClassificationAnnotation),
):
frame_dict[annotation.frame].append(annotation)
return dict(frame_dict)

def add_url_to_masks(self, signer) -> "Label":
"""
Creates signed urls for all masks in the Label.
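A small usage sketch for the new accessor, assuming `label` is one of the `Label` objects built in the notebook above:

```python
# Group the label's audio annotations by their start frame (milliseconds)
by_frame = label.audio_annotations_by_frame()

for frame_ms, annotations in sorted(by_frame.items()):
    names = [annotation.name for annotation in annotations]
    print(f"{frame_ms} ms: {names}")
```
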
libs/labelbox/src/labelbox/data/serialization/ndjson/classification.py
@@ -12,6 +12,7 @@

from ...annotation_types.annotation import ClassificationAnnotation
from ...annotation_types.video import VideoClassificationAnnotation
from ...annotation_types.audio import AudioClassificationAnnotation
from ...annotation_types.llm_prompt_response.prompt import (
PromptClassificationAnnotation,
PromptText,
@@ -223,7 +224,7 @@ def from_common(
# ====== End of subclasses


class NDText(NDAnnotation, NDTextSubclass):
class NDText(NDAnnotation, NDTextSubclass, VideoSupported):
@classmethod
def from_common(
cls,
@@ -242,6 +243,7 @@ def from_common(
name=name,
schema_id=feature_schema_id,
uuid=uuid,
frames=extra.get("frames"),
message_id=message_id,
confidence=text.confidence,
custom_metrics=text.custom_metrics,
@@ -399,7 +401,11 @@ class NDClassification:
@staticmethod
def to_common(
annotation: "NDClassificationType",
) -> Union[ClassificationAnnotation, VideoClassificationAnnotation]:
) -> Union[
ClassificationAnnotation,
VideoClassificationAnnotation,
AudioClassificationAnnotation,
]:
common = ClassificationAnnotation(
value=annotation.to_common(),
name=annotation.name,
@@ -414,18 +420,35 @@ def to_common(
results = []
for frame in annotation.frames:
for idx in range(frame.start, frame.end + 1, 1):
results.append(
VideoClassificationAnnotation(
frame=idx, **common.model_dump(exclude_none=True)
# Check if this is an audio annotation by looking at the extra data
# Audio annotations will have frame/end_frame in extra, video annotations won't
if (
hasattr(annotation, "extra")
and annotation.extra
and "frames" in annotation.extra
):
# This is likely an audio temporal annotation
results.append(
AudioClassificationAnnotation(
frame=idx, **common.model_dump(exclude_none=True)
)
)
else:
# This is a video temporal annotation
results.append(
VideoClassificationAnnotation(
frame=idx, **common.model_dump(exclude_none=True)
)
Bug: Temporal Annotation Classification Fails

The NDClassification.to_common method uses a fragile heuristic to distinguish between audio and video temporal annotations. It checks for "frames" in annotation.extra, but both annotation types can contain frame data there. This unreliable check can lead to incorrect classification and downstream processing errors.


)
)
return results

@classmethod
def from_common(
cls,
annotation: Union[
ClassificationAnnotation, VideoClassificationAnnotation
ClassificationAnnotation,
VideoClassificationAnnotation,
AudioClassificationAnnotation,
],
data: GenericDataRowData,
) -> Union[NDTextSubclass, NDChecklistSubclass, NDRadioSubclass]:
@@ -448,7 +471,9 @@ def from_common(
@staticmethod
def lookup_classification(
annotation: Union[
ClassificationAnnotation, VideoClassificationAnnotation
ClassificationAnnotation,
VideoClassificationAnnotation,
AudioClassificationAnnotation,
],
) -> Union[NDText, NDChecklist, NDRadio]:
return {Text: NDText, Checklist: NDChecklist, Radio: NDRadio}.get(
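
The review comment above argues that keying off `"frames" in annotation.extra` cannot reliably separate audio from video temporal annotations. One hypothetical alternative is to have the caller state the media type explicitly instead of guessing from `extra`; the function name, the `media_type` parameter, and the string values below are illustrations only, not part of this PR or of the labelbox API.

```python
from typing import Dict, List, Union

from labelbox.data.annotation_types import (
    AudioClassificationAnnotation,
    VideoClassificationAnnotation,
)


def expand_frames_explicitly(
    frames,               # objects with .start and .end, as in to_common above
    common_fields: Dict,  # shared fields dumped from the common ClassificationAnnotation
    media_type: str,      # hypothetical: supplied by the caller, e.g. "AUDIO" or "VIDEO"
) -> List[Union[AudioClassificationAnnotation, VideoClassificationAnnotation]]:
    annotation_cls = (
        AudioClassificationAnnotation
        if media_type == "AUDIO"
        else VideoClassificationAnnotation
    )
    # Expand each (start, end) range into one annotation per frame index
    return [
        annotation_cls(frame=idx, **common_fields)
        for frame in frames
        for idx in range(frame.start, frame.end + 1)
    ]
```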