From 9c71c10191c6692e239bd2dfcd5a62be02c1919f Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Mon, 11 Dec 2023 15:06:55 -0500 Subject: [PATCH] try for pandas CI (#910) --- fastparquet/parquet_thrift/__init__.py | 4 +- fastparquet/test/test_api.py | 2 +- fastparquet/test/test_output.py | 2 +- fastparquet/test/test_pd_optional_types.py | 80 ++++++++++------------ fastparquet/test/util.py | 17 +++++ 5 files changed, 58 insertions(+), 47 deletions(-) diff --git a/fastparquet/parquet_thrift/__init__.py b/fastparquet/parquet_thrift/__init__.py index 0aac534b..c71820ef 100644 --- a/fastparquet/parquet_thrift/__init__.py +++ b/fastparquet/parquet_thrift/__init__.py @@ -5,4 +5,6 @@ def __getattr__(name): # for compatability with coe that calls, e.g., parquet_thrift.RowGroup(...) from ..cencoding import ThriftObject - return partial(ThriftObject.from_fields, thrift_name=name) + if name[0].isupper(): + return partial(ThriftObject.from_fields, thrift_name=name) + raise AttributeError(name) diff --git a/fastparquet/test/test_api.py b/fastparquet/test/test_api.py index a5619b5c..8b3b2e9b 100644 --- a/fastparquet/test/test_api.py +++ b/fastparquet/test/test_api.py @@ -9,7 +9,7 @@ import fsspec import numpy as np import pandas as pd -from pandas._testing import makeMixedDataFrame +from .util import makeMixedDataFrame try: from pandas.tslib import Timestamp except ImportError: diff --git a/fastparquet/test/test_output.py b/fastparquet/test/test_output.py index 4cb8efcf..827603ed 100644 --- a/fastparquet/test/test_output.py +++ b/fastparquet/test/test_output.py @@ -8,7 +8,7 @@ from fastparquet import ParquetFile from fastparquet import write, parquet_thrift, update_file_custom_metadata from fastparquet import writer, encoding -from pandas._testing import makeMixedDataFrame +from .util import makeMixedDataFrame from pandas.testing import assert_frame_equal from pandas.api.types import CategoricalDtype import pytest diff --git a/fastparquet/test/test_pd_optional_types.py b/fastparquet/test/test_pd_optional_types.py index bb162354..efb85ac7 100644 --- a/fastparquet/test/test_pd_optional_types.py +++ b/fastparquet/test/test_pd_optional_types.py @@ -3,6 +3,7 @@ import numpy as np import pandas as pd from pandas.testing import assert_frame_equal +from pandas.core.arrays import IntegerArray import fastparquet as fp from .util import tempdir from fastparquet import write, parquet_thrift @@ -10,61 +11,51 @@ import numpy.random as random -EXPECTED_SERIES_INT8 = pd.Series(random.uniform(low=-128, high=127,size=100)).round() -EXPECTED_SERIES_INT16 = pd.Series(random.uniform(low=-32768, high=32767,size=100)).round() -EXPECTED_SERIES_INT32 = pd.Series(random.uniform(low=-2147483648, high=2147483647,size=100)).round() -EXPECTED_SERIES_INT64 = pd.Series(random.uniform(low=-9223372036854775808, high=9223372036854775807,size=100)).round() -EXPECTED_SERIES_UINT8 = pd.Series(random.uniform(low=0, high=255,size=100)).round() -EXPECTED_SERIES_UINT16 = pd.Series(random.uniform(low=0, high=65535,size=100)).round() -EXPECTED_SERIES_UINT32 = pd.Series(random.uniform(low=0, high=4294967295,size=100)).round() -EXPECTED_SERIES_UINT64 = pd.Series(random.uniform(low=0, high=18446744073709551615,size=100)).round() -EXPECTED_SERIES_BOOL = pd.Series(random.choice([False, True], 100)) -EXPECTED_SERIES_STRING = pd.Series(random.choice([ +EXPECTED_SERIES_INT8 = random.uniform(low=-128, high=127, size=100).round() +EXPECTED_SERIES_INT16 = random.uniform(low=-32768, high=32767, size=100).round() +EXPECTED_SERIES_INT32 = random.uniform(low=-2147483648, high=2147483647, size=100).round() +EXPECTED_SERIES_INT64 = random.uniform(low=-9223372036854775808, high=9223372036854775807, size=100).round() +EXPECTED_SERIES_UINT8 = random.uniform(low=0, high=255, size=100).round() +EXPECTED_SERIES_UINT16 = random.uniform(low=0, high=65535, size=100).round() +EXPECTED_SERIES_UINT32 = random.uniform(low=0, high=4294967295, size=100).round() +EXPECTED_SERIES_UINT64 = random.uniform(low=0, high=18446744073709551615, size=100).round() +EXPECTED_SERIES_BOOL = random.choice([False, True], 100) +EXPECTED_SERIES_STRING = random.choice([ 'You', 'are', 'my', 'fire', 'The', 'one', 'desire', 'Believe', 'when', 'I', 'say', 'I', 'want', 'it', 'that', 'way' - ], 100)) + ], 100) -EXPECTED_SERIES_INT8.loc[20:30] = np.nan -EXPECTED_SERIES_INT16.loc[20:30] = np.nan -EXPECTED_SERIES_INT32.loc[20:30] = np.nan -EXPECTED_SERIES_INT64.loc[20:30] = np.nan -EXPECTED_SERIES_UINT8.loc[20:30] = np.nan -EXPECTED_SERIES_UINT16.loc[20:30] = np.nan -EXPECTED_SERIES_UINT32.loc[20:30] = np.nan -EXPECTED_SERIES_UINT64.loc[20:30] = np.nan -EXPECTED_SERIES_BOOL.loc[20:30] = np.nan -EXPECTED_SERIES_STRING.loc[20:30] = np.nan +EXPECTED_SERIES_INT8[20:30] = np.nan +EXPECTED_SERIES_INT16[20:30] = np.nan +EXPECTED_SERIES_INT32[20:30] = np.nan +EXPECTED_SERIES_INT64[20:30] = np.nan +EXPECTED_SERIES_UINT8[20:30] = np.nan +EXPECTED_SERIES_UINT16[20:30] = np.nan +EXPECTED_SERIES_UINT32[20:30] = np.nan +EXPECTED_SERIES_UINT64[20:30] = np.nan +EXPECTED_SERIES_BOOL[20:30] = np.nan +EXPECTED_SERIES_STRING[20:30] = np.nan +mask = EXPECTED_SERIES_UINT64 > -1 TEST = pd.DataFrame({ - 'int8': EXPECTED_SERIES_INT8.astype('Int8'), - 'int16': EXPECTED_SERIES_INT16.astype('Int16'), - 'int32': EXPECTED_SERIES_INT32.astype('Int32'), - 'int64': EXPECTED_SERIES_INT64.astype('Int64'), - 'uint8': EXPECTED_SERIES_UINT8.astype('UInt8'), - 'uint16': EXPECTED_SERIES_UINT16.astype('UInt16'), - 'uint32': EXPECTED_SERIES_UINT32.astype('UInt32'), - 'uint64': EXPECTED_SERIES_UINT64.astype('UInt64'), - 'bool': EXPECTED_SERIES_BOOL.astype('boolean'), - 'string': EXPECTED_SERIES_STRING.astype('string') + 'int8': pd.Series(pd.array(EXPECTED_SERIES_INT8, dtype='Int8')), + 'int16': pd.Series(pd.array(EXPECTED_SERIES_INT16, dtype='Int16')), + 'int32': pd.Series(pd.array(EXPECTED_SERIES_INT32, dtype='Int32')), + 'int64': pd.Series(pd.array(EXPECTED_SERIES_INT64, dtype='Int64')), + 'uint8': pd.Series(pd.array(EXPECTED_SERIES_UINT8, dtype='UInt8')), + 'uint16': pd.Series(pd.array(EXPECTED_SERIES_UINT16, dtype='UInt16')), + 'uint32': pd.Series(pd.array(EXPECTED_SERIES_UINT32, dtype='UInt32')), + 'uint64': pd.Series(pd.array(EXPECTED_SERIES_UINT64, dtype='UInt64')), + 'bool': pd.Series(pd.array(EXPECTED_SERIES_BOOL, dtype='boolean')), + 'string': pd.Series(EXPECTED_SERIES_STRING, dtype='string') }) -EXPECTED = pd.DataFrame({ - 'int8': EXPECTED_SERIES_INT8.astype('float16'), - 'int16': EXPECTED_SERIES_INT16.astype('float32'), - 'int32': EXPECTED_SERIES_INT32.astype('float64'), - 'int64': EXPECTED_SERIES_INT64.astype('float64'), - 'uint8': EXPECTED_SERIES_UINT8.astype('float16'), - 'uint16': EXPECTED_SERIES_UINT16.astype('float32'), - 'uint32': EXPECTED_SERIES_UINT32.astype('float64'), - 'uint64': EXPECTED_SERIES_UINT64.astype('float64'), - 'bool': EXPECTED_SERIES_BOOL.astype('float16'), - 'string': EXPECTED_SERIES_STRING -}) +EXPECTED = TEST EXPECTED_PARQUET_TYPES = { @@ -80,7 +71,8 @@ 'string': 'BYTE_ARRAY' } -@pytest.mark.parametrize('comp', (None,'snappy', 'gzip')) + +@pytest.mark.parametrize('comp', (None, 'snappy', 'gzip')) @pytest.mark.parametrize('scheme', ('simple', 'hive')) def test_write_nullable_columns(tempdir, scheme, comp): fname = os.path.join(tempdir, 'test_write_nullable_columns.parquet') diff --git a/fastparquet/test/util.py b/fastparquet/test/util.py index e5705643..a82d4713 100644 --- a/fastparquet/test/util.py +++ b/fastparquet/test/util.py @@ -4,6 +4,8 @@ import tempfile import shutil +import pandas as pd + TEST_DATA = "test-data" port = 5555 @@ -76,3 +78,18 @@ def tempdir(): yield d if os.path.exists(d): shutil.rmtree(d, ignore_errors=True) + + + +def makeMixedDataFrame(): + index = pd.Index(["a", "b", "c", "d", "e"], name="index") + + data = { + "A": pd.Series([0.0, 1.0, 2.0, 3.0, 4.0], dtype="float64"), + "B": pd.Series([0.0, 1.0, 0.0, 1.0, 0.0], dtype="float64"), + "C": pd.Series(["foo1", "foo2", "foo3", "foo4", "foo5"], dtype='object'), + "D": pd.bdate_range("1/1/2009", periods=5), + } + return pd.DataFrame(data=data) + +