Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Update pyarrow version, use pyarrow as default backend for pandas rea… #5

Merged
merged 2 commits into the base branch from the feature branch on
Nov 24, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions cycquery/interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,7 @@ def run(
index_col: Optional[str] = None,
batch_mode: bool = False,
batch_size: int = 1000000,
dtype_backend: str = "pyarrow",
) -> Union[pd.DataFrame, Generator[pd.DataFrame, None, None]]:
"""Run the query, and fetch data.

Expand All @@ -194,6 +195,8 @@ def run(
Whether to run the query in batch mode. A generator is returned if True.
batch_size
Batch size for the query, default 1 million rows.
dtype_backend
Data type to use for the backend, default pyarrow.

Returns
-------
Expand All @@ -206,12 +209,14 @@ def run(
self.query,
limit=limit,
index_col=index_col,
dtype_backend=dtype_backend,
)
else:
self._data = self.database.run_query_batch(
self.query,
index_col=index_col,
batch_size=batch_size,
dtype_backend=dtype_backend,
)

return self._data
Expand Down
15 changes: 13 additions & 2 deletions cycquery/orm.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,7 @@ def _setup(self) -> None:
def run_query(
self,
query: Union[TableTypes, str],
dtype_backend: str = "pyarrow",
limit: Optional[int] = None,
index_col: Optional[str] = None,
) -> pd.DataFrame:
Expand All @@ -205,6 +206,8 @@ def run_query(
----------
query
Query to run.
dtype_backend
Backend for dtype conversion.
limit
Limit query result to limit.
index_col
Expand All @@ -227,7 +230,12 @@ def run_query(

# Run the query and return the results.
with self.session.connection():
data = pd.read_sql_query(query, self.engine, index_col=index_col)
data = pd.read_sql_query(
query,
self.engine,
index_col=index_col,
dtype_backend=dtype_backend,
)
LOGGER.info("Query returned successfully!")

return data
Expand Down Expand Up @@ -386,6 +394,7 @@ def run_query_batch(
query: TableTypes,
index_col: str,
batch_size: int,
dtype_backend: str = "pyarrow",
) -> Generator[pd.DataFrame, None, None]:
"""Generate query batches with complete sets of IDs in a batch.

Expand All @@ -402,6 +411,8 @@ def run_query_batch(
Batch size for the query. Since the partitioning happens on the index
column, the batch size is the approximate number of rows that will
be returned in a batch.
dtype_backend
Backend for dtype conversion.

Yields
------
Expand All @@ -420,4 +431,4 @@ def run_query_batch(
# Opportunity for easy multi-processing/parallelization here!
for condition in conditions:
run = (sess_query.filter(condition)).subquery()
yield pd.read_sql_query(run, self.engine)
yield pd.read_sql_query(run, self.engine, dtype_backend=dtype_backend)
Loading