Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Update pyarrow version, use pyarrow as default backend for pandas rea… #5

Merged
merged 2 commits into the base branch from the feature branch on
Nov 24, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions cycquery/interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,7 @@ def run(
index_col: Optional[str] = None,
batch_mode: bool = False,
batch_size: int = 1000000,
dtype_backend: str = "pyarrow",
) -> Union[pd.DataFrame, Generator[pd.DataFrame, None, None]]:
"""Run the query, and fetch data.

Expand All @@ -194,6 +195,8 @@ def run(
Whether to run the query in batch mode. A generator is returned if True.
batch_size
Batch size for the query, default 1 million rows.
dtype_backend
Data type to use for the backend, default pyarrow.

Returns
-------
Expand All @@ -206,12 +209,14 @@ def run(
self.query,
limit=limit,
index_col=index_col,
dtype_backend=dtype_backend,
)
else:
self._data = self.database.run_query_batch(
self.query,
index_col=index_col,
batch_size=batch_size,
dtype_backend=dtype_backend,
)

return self._data
Expand Down
15 changes: 13 additions & 2 deletions cycquery/orm.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,7 @@ def _setup(self) -> None:
def run_query(
self,
query: Union[TableTypes, str],
dtype_backend: str = "pyarrow",
limit: Optional[int] = None,
index_col: Optional[str] = None,
) -> pd.DataFrame:
Expand All @@ -205,6 +206,8 @@ def run_query(
----------
query
Query to run.
dtype_backend
Backend for dtype conversion.
limit
Limit query result to limit.
index_col
Expand All @@ -227,7 +230,12 @@ def run_query(

# Run the query and return the results.
with self.session.connection():
data = pd.read_sql_query(query, self.engine, index_col=index_col)
data = pd.read_sql_query(
query,
self.engine,
index_col=index_col,
dtype_backend=dtype_backend,
)
LOGGER.info("Query returned successfully!")

return data
Expand Down Expand Up @@ -386,6 +394,7 @@ def run_query_batch(
query: TableTypes,
index_col: str,
batch_size: int,
dtype_backend: str = "pyarrow",
) -> Generator[pd.DataFrame, None, None]:
"""Generate query batches with complete sets of IDs in a batch.

Expand All @@ -402,6 +411,8 @@ def run_query_batch(
Batch size for the query. Since the partitioning happens on the index
column, the batch size is the approximate number of rows that will
be returned in a batch.
dtype_backend
Backend for dtype conversion.

Yields
------
Expand All @@ -420,4 +431,4 @@ def run_query_batch(
# Opportunity for easy multi-processing/parallelization here!
for condition in conditions:
run = (sess_query.filter(condition)).subquery()
yield pd.read_sql_query(run, self.engine)
yield pd.read_sql_query(run, self.engine, dtype_backend=dtype_backend)
Loading