Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

rfc: merge upload_examples/_multipart #1477

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion python/docs/create_api_rst.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Script for auto-generating api_reference.rst."""

Check notice on line 1 in python/docs/create_api_rst.py

View workflow job for this annotation

GitHub Actions / benchmark

Benchmark results

...........
WARNING: the benchmark result may be unstable
* the standard deviation (75.0 ms) is 11% of the mean (690 ms)

Try to rerun the benchmark with more runs, values and/or loops.
Run 'python -m pyperf system tune' command to reduce the system jitter.
Use pyperf stats, pyperf dump and pyperf hist to analyze results.
Use --quiet option to hide these warnings.

create_5_000_run_trees: Mean +- std dev: 690 ms +- 75 ms
...........
create_10_000_run_trees: Mean +- std dev: 1.36 sec +- 0.12 sec
...........
create_20_000_run_trees: Mean +- std dev: 2.68 sec +- 0.14 sec
...........
dumps_class_nested_py_branch_and_leaf_200x400: Mean +- std dev: 703 us +- 8 us
...........
dumps_class_nested_py_leaf_50x100: Mean +- std dev: 25.2 ms +- 0.3 ms
...........
dumps_class_nested_py_leaf_100x200: Mean +- std dev: 105 ms +- 4 ms
...........
dumps_dataclass_nested_50x100: Mean +- std dev: 25.5 ms +- 0.2 ms
...........
WARNING: the benchmark result may be unstable
* the standard deviation (17.0 ms) is 24% of the mean (71.3 ms)

Try to rerun the benchmark with more runs, values and/or loops.
Run 'python -m pyperf system tune' command to reduce the system jitter.
Use pyperf stats, pyperf dump and pyperf hist to analyze results.
Use --quiet option to hide these warnings.

dumps_pydantic_nested_50x100: Mean +- std dev: 71.3 ms +- 17.0 ms
...........
dumps_pydanticv1_nested_50x100: Mean +- std dev: 194 ms +- 3 ms

Check notice on line 1 in python/docs/create_api_rst.py

View workflow job for this annotation

GitHub Actions / benchmark

Comparison against main

+-----------------------------------------------+----------+------------------------+
| Benchmark                                     | main     | changes                |
+===============================================+==========+========================+
| dumps_pydanticv1_nested_50x100                | 223 ms   | 194 ms: 1.15x faster   |
+-----------------------------------------------+----------+------------------------+
| create_20_000_run_trees                       | 2.75 sec | 2.68 sec: 1.03x faster |
+-----------------------------------------------+----------+------------------------+
| create_10_000_run_trees                       | 1.39 sec | 1.36 sec: 1.02x faster |
+-----------------------------------------------+----------+------------------------+
| dumps_class_nested_py_leaf_50x100             | 25.4 ms  | 25.2 ms: 1.01x faster  |
+-----------------------------------------------+----------+------------------------+
| dumps_class_nested_py_branch_and_leaf_200x400 | 708 us   | 703 us: 1.01x faster   |
+-----------------------------------------------+----------+------------------------+
| create_5_000_run_trees                        | 694 ms   | 690 ms: 1.01x faster   |
+-----------------------------------------------+----------+------------------------+
| dumps_dataclass_nested_50x100                 | 25.6 ms  | 25.5 ms: 1.00x faster  |
+-----------------------------------------------+----------+------------------------+
| dumps_class_nested_py_leaf_100x200            | 104 ms   | 105 ms: 1.01x slower   |
+-----------------------------------------------+----------+------------------------+
| dumps_pydantic_nested_50x100                  | 69.6 ms  | 71.3 ms: 1.03x slower  |
+-----------------------------------------------+----------+------------------------+
| Geometric mean                                | (ref)    | 1.02x faster           |
+-----------------------------------------------+----------+------------------------+

from __future__ import annotations

Expand Down Expand Up @@ -111,7 +111,9 @@
else (
"enum"
if issubclass(type_, Enum)
else "Pydantic" if issubclass(type_, BaseModel) else "Regular"
else "Pydantic"
if issubclass(type_, BaseModel)
else "Regular"
)
)
classes_.append(
Expand Down
162 changes: 82 additions & 80 deletions python/langsmith/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -3897,9 +3897,9 @@ def create_example_from_run(
def _prepare_multipart_data(
self,
examples: Union[
List[ls_schemas.ExampleUploadWithAttachments]
List[ls_schemas.ExampleCreate]
| List[ls_schemas.ExampleUpsertWithAttachments]
| List[ls_schemas.ExampleUpdateWithAttachments],
| List[ls_schemas.ExampleUpdate],
],
include_dataset_id: bool = False,
dangerously_allow_filesystem: bool = False,
Expand All @@ -3916,21 +3916,21 @@ def _prepare_multipart_data(

for example in examples:
if (
not isinstance(example, ls_schemas.ExampleUploadWithAttachments)
not isinstance(example, ls_schemas.ExampleCreate)
and not isinstance(example, ls_schemas.ExampleUpsertWithAttachments)
and not isinstance(example, ls_schemas.ExampleUpdateWithAttachments)
and not isinstance(example, ls_schemas.ExampleUpdate)
):
raise ValueError(
"The examples must be of type ExampleUploadWithAttachments"
"The examples must be of type ExampleCreate"
" or ExampleUpsertWithAttachments"
" or ExampleUpdateWithAttachments"
" or ExampleUpdate"
)
if example.id is not None:
example_id = str(example.id)
else:
example_id = str(uuid.uuid4())

if isinstance(example, ls_schemas.ExampleUpdateWithAttachments):
if isinstance(example, ls_schemas.ExampleUpdate):
created_at = None
else:
created_at = example.created_at
Expand Down Expand Up @@ -4024,7 +4024,7 @@ def _prepare_multipart_data(
)

if (
isinstance(example, ls_schemas.ExampleUpdateWithAttachments)
isinstance(example, ls_schemas.ExampleUpdate)
and example.attachments_operations
):
attachments_operationsb = _dumps_json(example.attachments_operations)
Expand Down Expand Up @@ -4052,14 +4052,14 @@ def update_examples_multipart(
self,
*,
dataset_id: ID_TYPE,
updates: Optional[List[ls_schemas.ExampleUpdateWithAttachments]] = None,
updates: Optional[List[ls_schemas.ExampleUpdate]] = None,
dangerously_allow_filesystem: bool = False,
) -> ls_schemas.UpsertExamplesResponse:
"""Update examples using multipart.

Args:
dataset_id (Union[UUID, str]): The ID of the dataset to update.
updates (Optional[List[ExampleUpdateWithAttachments]]): The updates to apply to the examples.
updates (Optional[List[ExampleUpdate]]): The updates to apply to the examples.

Raises:
ValueError: If the multipart examples endpoint is not enabled.
Expand Down Expand Up @@ -4100,14 +4100,14 @@ def upload_examples_multipart(
self,
*,
dataset_id: ID_TYPE,
uploads: Optional[List[ls_schemas.ExampleUploadWithAttachments]] = None,
uploads: Optional[List[ls_schemas.ExampleCreate]] = None,
dangerously_allow_filesystem: bool = False,
) -> ls_schemas.UpsertExamplesResponse:
"""Upload examples using multipart.

Args:
dataset_id (Union[UUID, str]): The ID of the dataset to upload to.
uploads (Optional[List[ExampleUploadWithAttachments]]): The examples to upload.
uploads (Optional[List[ExampleCreate]]): The examples to upload.

Returns:
ls_schemas.UpsertExamplesResponse: The count and ids of the successfully uploaded examples
Expand Down Expand Up @@ -4326,7 +4326,7 @@ def create_example(
if dataset_id is None:
dataset_id = self.read_dataset(dataset_name=dataset_name).id

data = {
data: ls_schemas.ExampleCreate = {
"inputs": inputs,
"outputs": outputs,
"dataset_id": dataset_id,
Expand All @@ -4335,20 +4335,11 @@ def create_example(
"source_run_id": source_run_id,
}
if created_at:
data["created_at"] = created_at.isoformat()
data["id"] = example_id or str(uuid.uuid4())
response = self.request_with_retries(
"POST",
"/examples",
headers={**self._headers, "Content-Type": "application/json"},
data=_dumps_json({k: v for k, v in data.items() if v is not None}),
)
ls_utils.raise_for_status_with_text(response)
result = response.json()
return ls_schemas.Example(
**result,
_host_url=self._host_url,
_tenant_id=self._get_optional_tenant_id(),
data.created_at = created_at.isoformat()
data.id = example_id or str(uuid.uuid4())
return self.upload_examples_multipart(
dataset_id=dataset_id,
uploads=[data]
)

def read_example(
Expand Down Expand Up @@ -4696,28 +4687,27 @@ def update_example(
split=split,
attachments_operations=attachments_operations,
)
example = {k: v for k, v in example.items() if v is not None}
response = self.request_with_retries(
"PATCH",
f"/examples/{_as_uuid(example_id, 'example_id')}",
headers={**self._headers, "Content-Type": "application/json"},
data=_dumps_json({k: v for k, v in example.items() if v is not None}),
)
ls_utils.raise_for_status_with_text(response)
return response.json()
example: ls_schemas.ExampleUpdate = {k: v for k, v in example.items() if v is not None}

if dataset_id is not None:
return self.update_examples_multipart(
dataset_id=dataset_id,
updates=[example]
)
else:
dataset_id = [e for e in self.list_examples(example_ids=[example_id])][0].dataset_id
return self.update_examples_multipart(
dataset_id=dataset_id,
updates=[example]
)

def update_examples(
self,
*,
example_ids: Sequence[ID_TYPE],
inputs: Optional[Sequence[Optional[Dict[str, Any]]]] = None,
outputs: Optional[Sequence[Optional[Mapping[str, Any]]]] = None,
metadata: Optional[Sequence[Optional[Dict]]] = None,
splits: Optional[Sequence[Optional[str | List[str]]]] = None,
dataset_ids: Optional[Sequence[Optional[ID_TYPE]]] = None,
attachments_operations: Optional[
Sequence[Optional[ls_schemas.AttachmentsOperations]]
] = None,
dataset_id: ID_TYPE | None = None,
updates: Optional[List[ls_schemas.ExampleUpdate]] = None,
dangerously_allow_filesystem: bool = False,
**kwargs: Any,
) -> Dict[str, Any]:
"""Update multiple examples.

Expand All @@ -4741,64 +4731,76 @@ def update_examples(
Returns:
Dict[str, Any]: The response from the server (specifies the number of examples updated).
"""
if attachments_operations is not None:
if kwargs and any([dataset_id, updates]):
raise ValueError("When passing kwargs, you must not pass dataset_id, or updates")
elif kwargs and not kwargs.get("example_ids"):
raise ValueError("When passing kwargs, you must pass example_ids")
elif kwargs:
example_ids = kwargs.pop("example_ids")
else:
response = self.update_examples_multipart(
dataset_id=dataset_id,
updates=updates,
dangerously_allow_filesystem=dangerously_allow_filesystem,
)
return {"message": f"{response.get('count', 0)} examples updated", **response}

if kwargs.get("attachments_operations") is not None or kwargs.get("attachments") is not None:
if not (self.info.instance_flags or {}).get(
"dataset_examples_multipart_enabled", False
):
raise ValueError(
"Your LangSmith version does not allow using the attachment operations, please update to the latest version."
)
sequence_args = {
"inputs": inputs,
"outputs": outputs,
"metadata": metadata,
"splits": splits,
"dataset_ids": dataset_ids,
"attachments_operations": attachments_operations,
}
# Since inputs are required, we will check against them
# Since ids are required, we will check against them
examples_len = len(example_ids)
for arg_name, arg_value in sequence_args.items():
for arg_name, arg_value in kwargs.items():
if arg_value is not None and len(arg_value) != examples_len:
raise ValueError(
f"Length of {arg_name} ({len(arg_value)}) does not match"
f" length of examples ({examples_len})"
)
examples = [
{
ls_schemas.ExampleUpdate(**{
"id": id_,
"inputs": in_,
"outputs": out_,
"dataset_id": dataset_id_,
"metadata": metadata_,
"split": split_,
"attachments": attachments_,
"attachments_operations": attachments_operations_,
}
for id_, in_, out_, metadata_, split_, dataset_id_, attachments_operations_ in zip(
})
for id_, in_, out_, metadata_, split_, dataset_id_, attachments_, attachments_operations_ in zip(
example_ids,
inputs or [None] * len(example_ids),
outputs or [None] * len(example_ids),
metadata or [None] * len(example_ids),
splits or [None] * len(example_ids),
dataset_ids or [None] * len(example_ids),
attachments_operations or [None] * len(example_ids),
kwargs.get("inputs", [None] * len(example_ids)),
kwargs.get("outputs", [None] * len(example_ids)),
kwargs.get("metadata", [None] * len(example_ids)),
kwargs.get("splits", [None] * len(example_ids)),
kwargs.get("dataset_ids", [None] * len(example_ids)),
kwargs.get("attachments", [None] * len(example_ids)),
kwargs.get("attachments_operations", [None] * len(example_ids)),
)
]
response = self.request_with_retries(
"PATCH",
"/examples/bulk",
headers={**self._headers, "Content-Type": "application/json"},
data=(
_dumps_json(
[
{k: v for k, v in example.items() if v is not None}
for example in examples
]
)
),
)
ls_utils.raise_for_status_with_text(response)
return response.json()
if "dataset_ids" not in kwargs:
# get dataset_id of first example, assume it works for all
dataset_id = [e for e in self.list_examples(example_ids=[example_ids[0]])][0].dataset_id
response = self.update_examples_multipart(
dataset_id=dataset_id,
updates=examples,
dangerously_allow_filesystem=dangerously_allow_filesystem,
)
else:
if len(set(kwargs["dataset_ids"])) > 1:
raise ValueError("Dataset IDs must be the same for all examples")
dataset_id = kwargs["dataset_ids"][0]
response = self.update_examples_multipart(
dataset_id=dataset_id,
updates=examples,
dangerously_allow_filesystem=dangerously_allow_filesystem,
)

return {"message": f"{response.get('count', 0)} examples updated", **response}

def delete_example(self, example_id: ID_TYPE) -> None:
"""Delete an example by ID.
Expand Down
41 changes: 15 additions & 26 deletions python/langsmith/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,15 +102,7 @@ class Config:
arbitrary_types_allowed = True


class ExampleCreate(ExampleBase):
"""Example create model."""

id: Optional[UUID]
created_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
split: Optional[Union[str, List[str]]] = None


class ExampleUploadWithAttachments(BaseModel):
class ExampleCreate(BaseModel):
"""Example upload with attachments."""

id: Optional[UUID]
Expand All @@ -121,8 +113,11 @@ class ExampleUploadWithAttachments(BaseModel):
split: Optional[Union[str, List[str]]] = None
attachments: Optional[Attachments] = None

def __init__(self, **data):
super().__init__(**data)


class ExampleUpsertWithAttachments(ExampleUploadWithAttachments):
class ExampleUpsertWithAttachments(ExampleCreate):
"""Example create with attachments."""

dataset_id: UUID
Expand Down Expand Up @@ -195,33 +190,27 @@ class AttachmentsOperations(BaseModel):
default_factory=list, description="List of attachment names to keep"
)


class ExampleUpdate(BaseModel):
"""Update class for Example."""
"""Example update with attachments."""

id: UUID
dataset_id: Optional[UUID] = None
inputs: Optional[Dict[str, Any]] = None
outputs: Optional[Dict[str, Any]] = None
attachments_operations: Optional[AttachmentsOperations] = None
metadata: Optional[Dict[str, Any]] = None
inputs: Dict[str, Any] = Field(default_factory=dict)
outputs: Optional[Dict[str, Any]] = Field(default=None)
metadata: Optional[Dict[str, Any]] = Field(default=None)
split: Optional[Union[str, List[str]]] = None
attachments: Optional[Attachments] = None
attachments_operations: Optional[AttachmentsOperations] = None

class Config:
"""Configuration class for the schema."""

frozen = True

def __init__(self, **data):
super().__init__(**data)

class ExampleUpdateWithAttachments(ExampleUpdate):
"""Example update with attachments."""

id: UUID
inputs: Dict[str, Any] = Field(default_factory=dict)
outputs: Optional[Dict[str, Any]] = Field(default=None)
metadata: Optional[Dict[str, Any]] = Field(default=None)
split: Optional[Union[str, List[str]]] = None
attachments: Optional[Attachments] = None
attachments_operations: Optional[AttachmentsOperations] = None
ExampleUpdateWithAttachments = ExampleUpdate


class DataType(str, Enum):
Expand Down
Loading
Loading