Skip to content

Commit

Permalink
fix for merlin writer issues
Browse files Browse the repository at this point in the history
  • Loading branch information
jperez999 committed Jan 17, 2024
1 parent 8e6a737 commit 1d48e60
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 2 deletions.
2 changes: 1 addition & 1 deletion merlin/core/compat/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@

from merlin.core.has_gpu import HAS_GPU # noqa pylint: disable=unused-import

cuda = False if not HAS_GPU else cuda
cuda = None if not HAS_GPU else cuda

try:
import psutil
Expand Down
6 changes: 5 additions & 1 deletion merlin/io/writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from merlin.core.compat import cupy as cp
from merlin.core.dispatch import annotate
from merlin.io.shuffle import shuffle_df
import logging


class Writer:
Expand Down Expand Up @@ -193,10 +194,13 @@ def _add_data_slice(self, df):
# Pandas does not support the `scatter_by_map` method
# used in `_add_data_scatter`. So, we manually shuffle
# the df and write out slices.
logging.error(f"df: {df.shape}, {self.num_out_files}")
if self.shuffle:
df = shuffle_df(df)
logging.error(f"df: {df.shape}, {self.num_out_files}")
int_slice_size = df.shape[0] // self.num_out_files
slice_size = int_slice_size if df.shape[0] % int_slice_size == 0 else int_slice_size + 1
logging.error(f"df: {df.shape}, {self.num_out_files}, {int_slice_size}")
slice_size = int_slice_size if int_slice_size > 0 and df.shape[0] % int_slice_size == 0 else int_slice_size + 1
for x in range(self.num_out_files):
start = x * slice_size
end = start + slice_size
Expand Down

0 comments on commit 1d48e60

Please sign in to comment.