feat: rust generator (#49)
supports generating `safetensor` chunks on mac (single worker) via rust binary
mosure authored Aug 27, 2024
2 parents 495710f + 05a72ed commit e96454c
Showing 11 changed files with 712 additions and 100 deletions.
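
For context, a minimal sketch of consuming the generated chunks from Python, mirroring the new `ffi/python/view.py` in this commit. It assumes the chunks were already produced by the new `generate` binary and written under `data/zeroverse/rust` (the path hard-coded in `view.py`; the binary's actual output location is configurable via its CLI):

```python
# Sketch only: assumes .safetensors chunks already exist under data/zeroverse/rust,
# e.g. produced on mac via the new rust binary
# (see ffi/README.md: cargo run -p bevy_zeroverse_ffi --bin generate -- --help).
from torch.utils.data import DataLoader

from bevy_zeroverse_dataloader import ChunkedDataset

# each chunk file holds a stacked batch of samples
chunked_dataset = ChunkedDataset("data/zeroverse/rust")
dataloader = DataLoader(chunked_dataset, batch_size=1, shuffle=False)

for batch in dataloader:
    # per-sample keys after this commit: color, depth, normal,
    # world_from_view, fovy, near, far, aabb
    print({k: tuple(v.shape) for k, v in batch.items()})
```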
README.md (2 changes: 1 addition & 1 deletion)
@@ -36,7 +36,7 @@ from torch.utils.data import DataLoader

dataset = BevyZeroverseDataset(
editor=False, headless=True, num_cameras=6,
width=640, height=360, num_samples=1e6,
width=640, height=480, num_samples=1e6,
)
dataloader = DataLoader(
dataset, batch_size=4, shuffle=True, num_workers=1,
ffi/Cargo.toml (15 changes: 13 additions & 2 deletions)
@@ -1,7 +1,8 @@
[package]
name = "bevy_zeroverse_ffi"
version = "0.1.0"
version = "0.3.0"
edition = "2021"
default-run = "generate"


[features]
@@ -14,12 +15,17 @@ extension-module = [


[dependencies]
bevy_args = "1.6"
bevy_zeroverse = { path = "../", default-features = false, features = ["python"] }
bytemuck = "1.17"
clap = { version = "4.4", features = ["derive"] }
image = "0.25"
ndarray = { version = "0.15", features = ["blas"] }
once_cell = "1.19"
pyo3 = { version = "0.22", features = ["macros"] }
pyo3-log = "0.11"
safetensors = "0.4"
serde = "1.0"


[dependencies.bevy]
@@ -34,5 +40,10 @@ features = [


[lib]
name = "bevy_zeroverse"
name = "bevy_zeroverse_ffi"
path = "src/lib.rs"


[[bin]]
name = "generate"
path = "src/generate.rs"
ffi/README.md (10 changes: 8 additions & 2 deletions)
@@ -23,7 +23,7 @@ from torch.utils.data import DataLoader

dataset = BevyZeroverseDataset(
editor=False, headless=True, num_cameras=6,
width=640, height=360, num_samples=1e6,
width=640, height=480, num_samples=1e6,
)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True, num_workers=1)

@@ -33,7 +33,13 @@ for batch in dataloader:
```


<!-- ### macos setup -->
### macos setup

macos does not support running the generator off the main thread. right now, the only way to generate on mac is from the rust binary, e.g.

```bash
cargo run -p bevy_zeroverse_ffi --bin generate -- --help
```

<!-- ```bash
LIBTORCH_PATH=$(python3 -c "import site; print(site.getsitepackages()[0] + '/torch/lib')")
ffi/python/bevy_zeroverse_dataloader/__init__.py (72 changes: 49 additions & 23 deletions)
@@ -5,17 +5,19 @@
import torch
from torch.utils.data import DataLoader, Dataset

import bevy_zeroverse
import bevy_zeroverse_ffi


# TODO: add sample-level world rotation augment
class View:
def __init__(self, color, depth, normal, world_from_view, fovy, width, height):
def __init__(self, color, depth, normal, world_from_view, fovy, near, far, width, height):
self.color = color
self.depth = depth
self.normal = normal
self.world_from_view = world_from_view
self.fovy = fovy
self.near = near
self.far = far
self.width = width
self.height = height

@@ -42,43 +44,55 @@ def reshape_data(data, dtype):

world_from_view = np.array(rust_view.world_from_view)
fovy = rust_view.fovy
return cls(color, depth, normal, world_from_view, fovy, width, height)
near = rust_view.near
far = rust_view.far
return cls(color, depth, normal, world_from_view, fovy, near, far, width, height)

def to_tensors(self):
color_tensor = torch.tensor(self.color, dtype=torch.float32)
depth_tensor = torch.tensor(self.depth, dtype=torch.float32)
normal_tensor = torch.tensor(self.normal, dtype=torch.float32)

color_tensor = color_tensor[..., :3]
depth_tensor = depth_tensor[..., 0]
depth_tensor = depth_tensor[..., 0:1]
normal_tensor = normal_tensor[..., :3]

world_from_view_tensor = torch.tensor(self.world_from_view, dtype=torch.float32)
fovy_tensor = torch.tensor(self.fovy, dtype=torch.float32)

fovy_tensor = torch.tensor(self.fovy, dtype=torch.float32).unsqueeze(-1)
near_tensor = torch.tensor(self.near, dtype=torch.float32).unsqueeze(-1)
far_tensor = torch.tensor(self.far, dtype=torch.float32).unsqueeze(-1)

return {
'color': color_tensor,
'depth': depth_tensor,
'normal': normal_tensor,
'world_from_view': world_from_view_tensor,
'fovy': fovy_tensor
'fovy': fovy_tensor,
'near': near_tensor,
'far': far_tensor,
}

class Sample:
def __init__(self, views):
def __init__(self, views, aabb):
self.views = views
self.aabb = aabb

@classmethod
def from_rust(cls, rust_sample, width, height):
views = [View.from_rust(view, width, height) for view in rust_sample.views]
return cls(views)
aabb = np.array(rust_sample.aabb)
return cls(views, aabb)

def to_tensors(self):
tensor_dict = {
'color': [],
'depth': [],
'normal': [],
'world_from_view': [],
'fovy': []
'fovy': [],
'near': [],
'far': [],
}

if len(self.views) == 0:
@@ -93,14 +107,16 @@ def to_tensors(self):
for key in tensor_dict:
tensor_dict[key] = torch.stack(tensor_dict[key], dim=0)

tensor_dict['aabb'] = torch.tensor(self.aabb, dtype=torch.float32)

return tensor_dict


# TODO: add dataset seed parameter to config
class BevyZeroverseDataset(Dataset):
scene_map = {
'object': bevy_zeroverse.ZeroverseSceneType.Object,
'room': bevy_zeroverse.ZeroverseSceneType.Room,
'object': bevy_zeroverse_ffi.ZeroverseSceneType.Object,
'room': bevy_zeroverse_ffi.ZeroverseSceneType.Room,
}

def __init__(
@@ -125,15 +141,15 @@ def __init__(
self.scene_type = scene_type

def initialize(self):
config = bevy_zeroverse.BevyZeroverseConfig()
config = bevy_zeroverse_ffi.BevyZeroverseConfig()
config.editor = self.editor
config.headless = self.headless
config.num_cameras = self.num_cameras
config.width = self.width
config.height = self.height
config.scene_type = BevyZeroverseDataset.scene_map[self.scene_type]
config.regenerate_scene_material_shuffle_period = 256
bevy_zeroverse.initialize(
bevy_zeroverse_ffi.initialize(
config,
self.root_asset_folder,
)
@@ -146,7 +162,7 @@ def __getitem__(self, idx):
if not self.initialized:
self.initialize()

rust_sample = bevy_zeroverse.next()
rust_sample = bevy_zeroverse_ffi.next()
sample = Sample.from_rust(rust_sample, self.width, self.height)
return sample.to_tensors()

@@ -157,18 +173,24 @@ def chunk_and_save(
bytes_per_chunk: int = int(256 * 1024 * 1024),
n_workers: int = 1,
):
output_dir.mkdir(exist_ok=True, parents=True)
existing_chunks = sorted(output_dir.glob("*.safetensors"))
if existing_chunks:
latest_chunk = existing_chunks[-1]
chunk_index = int(latest_chunk.stem)
print(f"resuming from chunk {chunk_index}.")
else:
chunk_index = 0

chunk_size = 0
chunk_index = 0
chunk = []
original_samples = []
chunk_file_paths = []
chunk_file_paths = [output_dir / f"{int(chunk.stem):0>6}.safetensors" for chunk in existing_chunks]

def save_chunk():
nonlocal chunk_size, chunk_index, chunk, original_samples, chunk_file_paths
nonlocal chunk_size, chunk_index, chunk, chunk_file_paths

chunk_key = f"{chunk_index:0>6}"
print(f"saving chunk {chunk_key} of {len(dataset)} ({chunk_size / 1e6:.2f} MB).")
output_dir.mkdir(exist_ok=True, parents=True)
file_path = output_dir / f"{chunk_key}.safetensors"

batch = {}
@@ -187,13 +209,17 @@ def save_chunk():
chunk_index += 1
chunk = []

del batch
torch.cuda.empty_cache()
import gc
gc.collect()

dataloader = DataLoader(dataset, batch_size=1, num_workers=n_workers, shuffle=False)

for idx, sample in enumerate(dataloader):
sample = {k: v.squeeze(0) for k, v in sample.items()}
sample_size = sum(tensor.numel() * tensor.element_size() for tensor in sample.values())
chunk.append(sample)
original_samples.append(sample)
chunk_size += sample_size

print(f" added sample {idx} to chunk ({sample_size / 1e6:.2f} MB).")
@@ -203,13 +229,13 @@ def save_chunk():
if chunk_size > 0:
save_chunk()

return original_samples, chunk_file_paths
return chunk_file_paths


class ChunkedDataset(Dataset):
def __init__(self, output_dir: Path):
self.output_dir = output_dir
self.chunk_files = sorted(output_dir.glob("*.safetensors"))
self.output_dir = Path(output_dir)
self.chunk_files = sorted(self.output_dir.glob("*.safetensors"))

def load_chunk(self, file_path: Path):
with safe_open(str(file_path), framework="pt", device="cpu") as f:
ffi/python/generate.py (5 changes: 1 addition & 4 deletions)
@@ -12,13 +12,10 @@ def generate_chunked_dataset(
headless=True,
num_cameras=4,
width=640,
height=360,
height=480,
num_samples=100,
)
) -> list:
if output_dir.exists():
shutil.rmtree(output_dir)

return chunk_and_save(
dataset,
output_dir,
ffi/python/test.py (37 changes: 10 additions & 27 deletions)
@@ -12,10 +12,17 @@
def visualize(batch):
print(batch['color'].shape)

is_chunked = len(batch['color'].shape) == 6

color_images = batch['color'].numpy()
depth_images = batch['depth'].numpy()
normal_images = batch['normal'].numpy()

if is_chunked:
color_images = color_images.squeeze(0)
depth_images = depth_images.squeeze(0)
normal_images = normal_images.squeeze(0)

batch_size = color_images.shape[0]
num_cameras = color_images.shape[1]
num_image_types = 3 # color, depth, normal
@@ -82,7 +89,7 @@ def benchmark(dataloader):
def test():
dataset = BevyZeroverseDataset(
editor=False, headless=True, num_cameras=6,
width=640, height=360, num_samples=1e6,
width=640, height=480, num_samples=1e6,
)
dataloader = DataLoader(dataset, batch_size=5, shuffle=True, num_workers=2)

@@ -99,7 +106,7 @@ def setUp(self):
self.headless = True
self.num_cameras = 4
self.width = 640
self.height = 360
self.height = 480
self.num_samples = 10
self.bytes_per_chunk = int(256 * 1024 * 1024)
self.stage = "test"
@@ -109,31 +116,7 @@ def setUp(self):
shutil.rmtree(self.output_dir)

self.dataset = BevyZeroverseDataset(self.editor, self.headless, self.num_cameras, self.width, self.height, self.num_samples)
self.original_samples = chunk_and_save(self.dataset, self.output_dir, self.bytes_per_chunk)

def test_chunked_dataset_loading(self):
chunked_dataset = ChunkedDataset(self.output_dir)
dataloader = DataLoader(chunked_dataset, batch_size=1, shuffle=False)

num_chunks = 0
total_loaded_samples = 0

expected_shapes = {key: tensor.shape for key, tensor in self.original_samples[0][0].items()}

for batch in dataloader:
num_chunks += 1

for key, tensor in batch.items():
tensor = tensor.squeeze(0)
expected_shape = (tensor.shape[0],) + expected_shapes[key]
self.assertEqual(tensor.shape, expected_shape, f"Mismatch in tensor shape for key {key}")

total_loaded_samples += batch['color'].squeeze(0).shape[0]

expected_num_chunks = len(chunked_dataset)
self.assertEqual(num_chunks, expected_num_chunks, "Mismatch in number of chunks")

self.assertEqual(total_loaded_samples, len(self.original_samples[0]))
self.chunk_paths = chunk_and_save(self.dataset, self.output_dir, self.bytes_per_chunk)

def test_benchmark_chunked_dataloader(self):
chunked_dataset = ChunkedDataset(self.output_dir)
ffi/python/view.py (17 changes: 17 additions & 0 deletions)
@@ -0,0 +1,17 @@
from torch.utils.data import DataLoader

from bevy_zeroverse_dataloader import ChunkedDataset

from test import visualize


def main():
chunked_dataset = ChunkedDataset("data/zeroverse/rust")
dataloader = DataLoader(chunked_dataset, batch_size=1, shuffle=False)

for batch in dataloader:
visualize(batch)


if __name__ == "__main__":
main()
ffi/setup.py (6 changes: 3 additions & 3 deletions)
@@ -3,12 +3,12 @@


setup(
name="bevy_zeroverse",
version="0.1",
name="bevy_zeroverse_ffi",
version="0.3",
packages=find_packages(where="python"),
package_dir={"": "python"},
rust_extensions=[
RustExtension("bevy_zeroverse")
RustExtension("bevy_zeroverse_ffi")
],
include_package_data=True,
zip_safe=False,
