diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index cb9c904121..17dde7021b 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -112,12 +112,15 @@ def __init__( *, session: typing.Optional[bigframes.session.Session] = None, ): + global bigframes + if copy is not None and not copy: raise ValueError( f"DataFrame constructor only supports copy=True. {constants.FEEDBACK_LINK}" ) - # just ignore object dtype if provided - if dtype in {numpy.dtypes.ObjectDType, "object"}: + # Ignore object dtype if provided, as it provides no additional + # information about what BigQuery type to use. + if dtype is not None and bigframes.dtypes.is_object_like(dtype): dtype = None # Check to see if constructing from BigQuery-backed objects before diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 563904fbb6..45c1e7e4e2 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -205,67 +205,74 @@ class SimpleDtypeInfo: ## dtype predicates - use these to maintain consistency -def is_datetime_like(type: ExpressionType) -> bool: - return type in (DATETIME_DTYPE, TIMESTAMP_DTYPE) +def is_datetime_like(type_: ExpressionType) -> bool: + return type_ in (DATETIME_DTYPE, TIMESTAMP_DTYPE) -def is_date_like(type: ExpressionType) -> bool: - return type in (DATETIME_DTYPE, TIMESTAMP_DTYPE, DATE_DTYPE) +def is_date_like(type_: ExpressionType) -> bool: + return type_ in (DATETIME_DTYPE, TIMESTAMP_DTYPE, DATE_DTYPE) -def is_time_like(type: ExpressionType) -> bool: - return type in (DATETIME_DTYPE, TIMESTAMP_DTYPE, TIME_DTYPE) +def is_time_like(type_: ExpressionType) -> bool: + return type_ in (DATETIME_DTYPE, TIMESTAMP_DTYPE, TIME_DTYPE) -def is_binary_like(type: ExpressionType) -> bool: - return type in (BOOL_DTYPE, BYTES_DTYPE, INT_DTYPE) +def is_binary_like(type_: ExpressionType) -> bool: + return type_ in (BOOL_DTYPE, BYTES_DTYPE, INT_DTYPE) -def is_string_like(type: ExpressionType) -> bool: - return type in (STRING_DTYPE, BYTES_DTYPE) +def is_object_like(type_: Union[ExpressionType, str]) -> bool: + # See: https://stackoverflow.com/a/40312924/101923 and + # https://numpy.org/doc/stable/reference/generated/numpy.dtype.kind.html + # for the way to identify object type. + return type_ in ("object", "O") or getattr(type_, "kind", None) == "O" -def is_array_like(type: ExpressionType) -> bool: - return isinstance(type, pd.ArrowDtype) and isinstance( - type.pyarrow_dtype, pa.ListType +def is_string_like(type_: ExpressionType) -> bool: + return type_ in (STRING_DTYPE, BYTES_DTYPE) + + +def is_array_like(type_: ExpressionType) -> bool: + return isinstance(type_, pd.ArrowDtype) and isinstance( + type_.pyarrow_dtype, pa.ListType ) -def is_array_string_like(type: ExpressionType) -> bool: +def is_array_string_like(type_: ExpressionType) -> bool: return ( - isinstance(type, pd.ArrowDtype) - and isinstance(type.pyarrow_dtype, pa.ListType) - and pa.types.is_string(type.pyarrow_dtype.value_type) + isinstance(type_, pd.ArrowDtype) + and isinstance(type_.pyarrow_dtype, pa.ListType) + and pa.types.is_string(type_.pyarrow_dtype.value_type) ) -def is_struct_like(type: ExpressionType) -> bool: - return isinstance(type, pd.ArrowDtype) and isinstance( - type.pyarrow_dtype, pa.StructType +def is_struct_like(type_: ExpressionType) -> bool: + return isinstance(type_, pd.ArrowDtype) and isinstance( + type_.pyarrow_dtype, pa.StructType ) -def is_json_like(type: ExpressionType) -> bool: +def is_json_like(type_: ExpressionType) -> bool: # TODO: Add JSON type support - return type == STRING_DTYPE + return type_ == STRING_DTYPE -def is_json_encoding_type(type: ExpressionType) -> bool: +def is_json_encoding_type(type_: ExpressionType) -> bool: # Types can be converted into JSON. # https://cloud.google.com/bigquery/docs/reference/standard-sql/json_functions#json_encodings - return type != GEO_DTYPE + return type_ != GEO_DTYPE -def is_numeric(type: ExpressionType) -> bool: - return type in NUMERIC_BIGFRAMES_TYPES_PERMISSIVE +def is_numeric(type_: ExpressionType) -> bool: + return type_ in NUMERIC_BIGFRAMES_TYPES_PERMISSIVE -def is_iterable(type: ExpressionType) -> bool: - return type in (STRING_DTYPE, BYTES_DTYPE) or is_array_like(type) +def is_iterable(type_: ExpressionType) -> bool: + return type_ in (STRING_DTYPE, BYTES_DTYPE) or is_array_like(type_) -def is_comparable(type: ExpressionType) -> bool: - return (type is not None) and is_orderable(type) +def is_comparable(type_: ExpressionType) -> bool: + return (type_ is not None) and is_orderable(type_) _ORDERABLE_SIMPLE_TYPES = set( @@ -273,9 +280,9 @@ def is_comparable(type: ExpressionType) -> bool: ) -def is_orderable(type: ExpressionType) -> bool: +def is_orderable(type_: ExpressionType) -> bool: # On BQ side, ARRAY, STRUCT, GEOGRAPHY, JSON are not orderable - return type in _ORDERABLE_SIMPLE_TYPES + return type_ in _ORDERABLE_SIMPLE_TYPES _CLUSTERABLE_SIMPLE_TYPES = set( @@ -283,15 +290,15 @@ def is_orderable(type: ExpressionType) -> bool: ) -def is_clusterable(type: ExpressionType) -> bool: +def is_clusterable(type_: ExpressionType) -> bool: # https://cloud.google.com/bigquery/docs/clustered-tables#cluster_column_types # This is based on default database type mapping, could in theory represent in non-default bq type to cluster. - return type in _CLUSTERABLE_SIMPLE_TYPES + return type_ in _CLUSTERABLE_SIMPLE_TYPES -def is_bool_coercable(type: ExpressionType) -> bool: +def is_bool_coercable(type_: ExpressionType) -> bool: # TODO: Implement more bool coercions - return (type is None) or is_numeric(type) or is_string_like(type) + return (type_ is None) or is_numeric(type_) or is_string_like(type_) BIGFRAMES_STRING_TO_BIGFRAMES: Dict[DtypeString, Dtype] = { diff --git a/bigframes/operations/base.py b/bigframes/operations/base.py index 1daa1ea5ae..2f87045415 100644 --- a/bigframes/operations/base.py +++ b/bigframes/operations/base.py @@ -18,7 +18,6 @@ from typing import List, Sequence import bigframes_vendored.pandas.pandas._typing as vendored_pandas_typing -import numpy import pandas as pd import bigframes.constants as constants @@ -49,8 +48,9 @@ def __init__( ): import bigframes.pandas - # just ignore object dtype if provided - if dtype in {numpy.dtypes.ObjectDType, "object"}: + # Ignore object dtype if provided, as it provides no additional + # information about what BigQuery type to use. + if dtype is not None and bigframes.dtypes.is_object_like(dtype): dtype = None read_pandas_func = ( diff --git a/setup.py b/setup.py index 79baf1fb23..0e0be5fd77 100644 --- a/setup.py +++ b/setup.py @@ -49,6 +49,7 @@ "google-cloud-storage >=2.0.0", "ibis-framework[bigquery] >=8.0.0,<9.0.0dev", "jellyfish >=0.8.9", + "numpy >=1.24.0", # TODO: Relax upper bound once we have fixed `system_prerelease` tests. "pandas >=1.5.0", "pyarrow >=8.0.0", diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index 5a76698576..0d3f16e95f 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -14,6 +14,7 @@ google-cloud-resource-manager==1.10.3 google-cloud-storage==2.0.0 ibis-framework==8.0.0 jellyfish==0.8.9 +numpy==1.24.0 pandas==1.5.0 pyarrow==8.0.0 pydata-google-auth==1.8.2