Skip to content

Commit

Permalink
Use np.int64 type for day to nanosecond conversion (NEP50) (#922)
Browse files: browse the repository at this point in the history
* Use np.int64 type for day to nanosecond conversion (NEP50)

* Ditch dask CI, fix pandas

---------

Co-authored-by: Martin Durant <[email protected]>
  • Loading branch information
bnavigator and martindurant authored May 8, 2024
1 parent ec26733 commit bb00f37
Show file tree
Hide file tree
Showing 7 changed files with 9 additions and 47 deletions.
30 changes: 0 additions & 30 deletions .github/workflows/main.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -70,36 +70,6 @@ jobs:
run: |
echo "FASTPARQUET_DATAPAGE_V2=$FASTPARQUET_DATAPAGE_V2"
pytest --verbose --cov=fastparquet
dask:
name: dask
runs-on: ubuntu-latest
steps:
- name: APT
run: sudo apt-get install liblzo2-dev

- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0

- name: Setup conda
uses: mamba-org/provision-with-micromamba@main
with:
environment-file: ci/environment-py39.yml

- name: pip-install
shell: bash -l {0}
run: |
git clone https://github.com/dask/dask
pip install pyarrow
pip install -e dask/
pip install -e . --no-deps
- name: Run Tests
shell: bash -l {0}
run: |
pytest --verbose dask/dask/dataframe/io/tests/test_parquet.py
pandas:
name: pandas
runs-on: ubuntu-latest
Expand Down
9 changes: 5 additions & 4 deletions fastparquet/converted_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,9 @@ def unbson(x):
def tobson(x):
raise ImportError("BSON not found")

DAYS_TO_MILLIS = 86400000000000
"""Number of millis in a day. Used to convert a Date to a date"""
# Explicitly use numpy type in order to avoid promotion errors due to NEP 50 in numpy >= 2
DAYS_TO_NANOS = np.int64(86400000000000)
"""Number of nanoseconds in a day. Used to convert a Date to a date"""
nat = np.datetime64('NaT').view('int64')

simple = {
Expand Down Expand Up @@ -158,7 +159,7 @@ def convert(data, se, timestamp96=True, dtype=None):
if se.type == parquet_thrift.Type.INT96 and timestamp96:
data2 = data.view([('ns', 'i8'), ('day', 'i4')])
# TODO: this should be ms unit, now that we can?
return ((data2['day'] - 2440588) * 86400000000000 +
return ((data2['day'] - np.int64(2440588)) * DAYS_TO_NANOS +
data2['ns']).view('M8[ns]')
if se.logicalType is not None and se.logicalType.TIMESTAMP is not None:
dt = _logical_to_time_dtype(se.logicalType.TIMESTAMP)
Expand Down Expand Up @@ -188,7 +189,7 @@ def convert(data, se, timestamp96=True, dtype=None):
for i in range(len(data))
])
elif ctype == parquet_thrift.ConvertedType.DATE:
data = data * DAYS_TO_MILLIS
data = data * DAYS_TO_NANOS
return data.view('datetime64[ns]')
elif ctype == parquet_thrift.ConvertedType.TIME_MILLIS:
# this was not covered by new pandas time units
Expand Down
2 changes: 1 addition & 1 deletion fastparquet/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ def cat(col):
tz_to_dt_tz(timezones[str(col)]))
else:
index = Index(d)
views[col] = index.values
views[col] = d
else:
index = MultiIndex([[]], [[]])
# index = MultiIndex.from_arrays(indexes)
Expand Down
4 changes: 1 addition & 3 deletions fastparquet/test/test_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -1022,9 +1022,7 @@ def test_no_string(tmpdir):
df["A"] = df["A"].astype(pd.StringDtype())

# set *all* values to NA
df["A"].iloc[0] = pd.NA
df["A"].iloc[1] = pd.NA
df["A"].iloc[2] = pd.NA
df.loc[:, "A"] = pd.NA
df.to_parquet(fn, engine="fastparquet")
df2 = pd.read_parquet(fn)
assert pd.isna(df2.A).all()
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
[build-system]
requires = ["setuptools", "wheel", "Cython >= 0.29.23", "oldest-supported-numpy", "pytest-runner"]
requires = ["setuptools", "setuptools_scm", "Cython >= 0.29.23", "numpy>=2.0.0rc1"]
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
pandas>=1.5.0
numpy>=1.20.3
numpy
cramjam>=2.3
fsspec
packaging
7 changes: 0 additions & 7 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,13 +53,6 @@ def fix_exts(sources):
'local_scheme': 'no-local-version',
'write_to': 'fastparquet/_version.py'
},
setup_requires=[
'setuptools>18.0',
'setuptools-scm>1.5.4',
'Cython',
'pytest-runner',
'oldest-supported-numpy'
],
description='Python support for Parquet file format',
author='Martin Durant',
author_email='[email protected]',
Expand Down

0 comments on commit bb00f37

Please sign in to comment.