ibis-project · jitingxu1 · Aug 23, 2024 · Aug 23, 2024 · Aug 23, 2024 · Aug 26, 2024
diff --git a/ibis/backends/__init__.py b/ibis/backends/__init__.py
@@ -4,6 +4,7 @@
 import collections.abc
 import contextlib
 import functools
+import glob
 import importlib.metadata
 import keyword
 import re
@@ -1269,6 +1270,108 @@
             f"{cls.name} backend has not implemented `has_operation` API"
         )
 
+    @util.experimental
+    def read_csv(
+        self, path: str | Path, table_name: str | None = None, **kwargs: Any
+    ) -> ir.Table:
+        """Register a CSV file as a table in the current backend.
+
+        This function reads a CSV file and registers it as a table in the current
+        backend. Note that for Impala and Trino backends, CSV read performance
+        may be suboptimal.
+
+        Parameters
+        ----------
+        path
+            The data source. A string or Path to the CSV file.
+        table_name
+            An optional name to use for the created table. This defaults to
+            a sequentially generated name.
+        **kwargs
+            Additional keyword arguments passed to the backend loading function.
+
+        Returns
+        -------
+        ir.Table
+            The just-registered table
+
+        Examples
+        --------
+        Connect to a SQLite database:
+
+        >>> con = ibis.sqlite.connect()
+
+        Read a single csv file:
+
+        >>> table = con.read_csv("path/to/file.csv")
+
-        Read a single csv file:
-
-        >>> table = con.read_csv("path/to/file.csv")
+        Read a single csv file:
+
+        >>> table = con.read_csv("path/to/file.csv")
+        
+        Read a single csv file, skipping the first row, with a custom delimiter:
+
+        >>> table = con.read_csv("path/to/file.csv", skip_rows=1, delimiter=";")
+
+        Read a single csv file, but only load the specified columns:
+
+        >>> table = con.read_csv("path/to/file.csv", include_columns=["species", "island"])
+
-        Read a single csv file:
-
-        >>> table = con.read_csv("path/to/file.csv")
+        Read a single csv file:
+
+        >>> table = con.read_csv("path/to/file.csv")
+        
+        Read a single csv file, skipping the first row, with a custom delimiter:
+
+        >>> table = con.read_csv("path/to/file.csv", skip_rows=1, delimiter=";")
+
+        Read a single csv file, but only load the specified columns:
+
+        >>> table = con.read_csv("path/to/file.csv", include_columns=["species", "island"])
+
+        Read all csv files in a directory:
+
+        >>> table = con.read_parquet("path/to/csv_directory/*")
+
+        Read all csv files with a glob pattern:
+
+        >>> table = con.read_csv("path/to/csv_directory/test_*.csv")
+
+        Read csv file from s3:
+
+        >>> table = con.read_csv("s3://bucket/path/to/file.csv")
+
+        """
+        pa = self._import_pyarrow()
+        import pyarrow.csv as pcsv
+        from pyarrow import fs
+
+        read_options_args = {}
+        parse_options_args = {}
+        convert_options_args = {}
+        memory_pool = None
+
+        for key, value in kwargs.items():
+            if hasattr(pcsv.ReadOptions, key):
+                read_options_args[key] = value
+            elif hasattr(pcsv.ParseOptions, key):
+                parse_options_args[key] = value
+            elif hasattr(pcsv.ConvertOptions, key):
+                convert_options_args[key] = value
+            elif key == "memory_pool":
+                memory_pool = value
+            else:
+                raise ValueError(f"Invalid args: {key!r}")
+
+        read_options = pcsv.ReadOptions(**read_options_args)
+        parse_options = pcsv.ParseOptions(**parse_options_args)
+        convert_options = pcsv.ConvertOptions(**convert_options_args)
+        if memory_pool:
+            memory_pool = pa.default_memory_pool()
+
+        path = str(path)
+        file_system, path = fs.FileSystem.from_uri(path)
+
+        if isinstance(file_system, fs.LocalFileSystem):
+            paths = glob.glob(path)
+            if not paths:
+                raise FileNotFoundError(f"No files found at {path!r}")
+        else:
+            paths = [path]
+
+        pyarrow_tables = []
+        for path in paths:
+            with file_system.open_input_file(path) as f:
+                pyarrow_table = pcsv.read_csv(
+                    f,
+                    read_options=read_options,
+                    parse_options=parse_options,
+                    convert_options=convert_options,
+                    memory_pool=memory_pool,
+                )
+                pyarrow_tables.append(pyarrow_table)
+
+        pyarrow_table = pa.concat_tables(pyarrow_tables)
+        table_name = table_name or util.gen_name("read_csv")
+        self.create_table(table_name, pyarrow_table)
+        return self.table(table_name)
+
     def _transpile_sql(self, query: str, *, dialect: str | None = None) -> str:
         # only transpile if dialect was passed
         if dialect is None:

diff --git a/ibis/backends/tests/test_register.py b/ibis/backends/tests/test_register.py
@@ -12,14 +12,13 @@
 
 import ibis
 from ibis.backends.conftest import TEST_TABLES
+from ibis.backends.tests.errors import MySQLOperationalError, PyODBCProgrammingError
 
 if TYPE_CHECKING:
     from collections.abc import Iterator
 
     import pyarrow as pa
 
-pytestmark = pytest.mark.notimpl(["druid", "exasol", "oracle"])
-
 
 @contextlib.contextmanager
 def pushd(new_dir):
@@ -96,6 +95,7 @@ def gzip_csv(data_dir, tmp_path):
         "trino",
     ]
 )
+@pytest.mark.notimpl(["druid", "exasol", "oracle"])
 def test_register_csv(con, data_dir, fname, in_table_name, out_table_name):
     with pushd(data_dir / "csv"):
         with pytest.warns(FutureWarning, match="v9.1"):
@@ -107,7 +107,7 @@ def test_register_csv(con, data_dir, fname, in_table_name, out_table_name):
 
 
 # TODO: rewrite or delete test when register api is removed
-@pytest.mark.notimpl(["datafusion"])
+@pytest.mark.notimpl(["datafusion", "druid", "exasol", "oracle"])
 @pytest.mark.notyet(
     [
         "bigquery",
@@ -147,6 +147,7 @@ def test_register_csv_gz(con, data_dir, gzip_csv):
         "trino",
     ]
 )
+@pytest.mark.notimpl(["druid", "exasol", "oracle"])
 def test_register_with_dotted_name(con, data_dir, tmp_path):
     basename = "foo.bar.baz/diamonds.csv"
     f = tmp_path.joinpath(basename)
@@ -204,6 +205,7 @@ def read_table(path: Path) -> Iterator[tuple[str, pa.Table]]:
         "trino",
     ]
 )
+@pytest.mark.notimpl(["druid", "exasol", "oracle"])
 def test_register_parquet(
     con, tmp_path, data_dir, fname, in_table_name, out_table_name
 ):
@@ -242,6 +244,7 @@ def test_register_parquet(
         "trino",
     ]
 )
+@pytest.mark.notimpl(["druid", "exasol", "oracle"])
 def test_register_iterator_parquet(
     con,
     tmp_path,
@@ -270,7 +273,7 @@ def test_register_iterator_parquet(
 # TODO: remove entirely when `register` is removed
 # This same functionality is implemented across all backends
 # via `create_table` and tested in `test_client.py`
-@pytest.mark.notimpl(["datafusion"])
+@pytest.mark.notimpl(["datafusion", "druid", "exasol", "oracle"])
 @pytest.mark.notyet(
     [
         "bigquery",
@@ -304,7 +307,7 @@ def test_register_pandas(con):
 # TODO: remove entirely when `register` is removed
 # This same functionality is implemented across all backends
 # via `create_table` and tested in `test_client.py`
-@pytest.mark.notimpl(["datafusion", "polars"])
+@pytest.mark.notimpl(["datafusion", "polars", "druid", "exasol", "oracle"])
 @pytest.mark.notyet(
     [
         "bigquery",
@@ -345,6 +348,7 @@ def test_register_pyarrow_tables(con):
         "trino",
     ]
 )
+@pytest.mark.notimpl(["druid", "exasol", "oracle"])
 def test_csv_reregister_schema(con, tmp_path):
     foo = tmp_path.joinpath("foo.csv")
     with foo.open("w", newline="") as csvfile:
@@ -373,10 +377,13 @@ def test_csv_reregister_schema(con, tmp_path):
         "bigquery",
         "clickhouse",
         "datafusion",
+        "druid",
+        "exasol",
         "flink",
         "impala",
         "mysql",
         "mssql",
+        "oracle",
         "polars",
         "postgres",
         "risingwave",
@@ -410,6 +417,7 @@ def test_register_garbage(con, monkeypatch):
 @pytest.mark.notyet(
     ["flink", "impala", "mssql", "mysql", "postgres", "risingwave", "sqlite", "trino"]
 )
+@pytest.mark.notimpl(["druid", "exasol", "oracle"])
 def test_read_parquet(con, tmp_path, data_dir, fname, in_table_name):
     pq = pytest.importorskip("pyarrow.parquet")
 
@@ -450,6 +458,7 @@ def ft_data(data_dir):
         "trino",
     ]
 )
+@pytest.mark.notimpl(["druid", "exasol", "oracle"])
 def test_read_parquet_glob(con, tmp_path, ft_data):
     pq = pytest.importorskip("pyarrow.parquet")
 
@@ -466,18 +475,10 @@ def test_read_parquet_glob(con, tmp_path, ft_data):
     assert table.count().execute() == nrows * ntables
 
 
-@pytest.mark.notyet(
-    [
-        "flink",
-        "impala",
-        "mssql",
-        "mysql",
-        "postgres",
-        "risingwave",
-        "sqlite",
-        "trino",
-    ]
-)
+@pytest.mark.notyet(["flink"])
+@pytest.mark.notimpl(["druid"])
+@pytest.mark.notimpl(["mssql"], raises=PyODBCProgrammingError)
+@pytest.mark.notimpl(["mysql"], raises=MySQLOperationalError)
 def test_read_csv_glob(con, tmp_path, ft_data):
     pc = pytest.importorskip("pyarrow.csv")
 
@@ -512,6 +513,7 @@ def test_read_csv_glob(con, tmp_path, ft_data):
     raises=ValueError,
     reason="read_json() missing required argument: 'schema'",
 )
+@pytest.mark.notimpl(["druid", "exasol", "oracle"])
 def test_read_json_glob(con, tmp_path, ft_data):
     nrows = len(ft_data)
     ntables = 2
@@ -555,14 +557,26 @@ def num_diamonds(data_dir):
     "in_table_name",
     [param(None, id="default"), param("fancy_stones", id="file_name")],
 )
-@pytest.mark.notyet(
-    ["flink", "impala", "mssql", "mysql", "postgres", "risingwave", "sqlite", "trino"]
-)
+@pytest.mark.notyet(["flink"])
+@pytest.mark.notimpl(["druid", "exasol", "oracle"])
 def test_read_csv(con, data_dir, in_table_name, num_diamonds):
+    if con.name in ("trino", "impala"):
+        # TODO: remove after trino and impala have efficient insertion
+        pytest.skip(
+            "Both Impala and Trino lack efficient data insertion methods from Python."
+        )
     fname = "diamonds.csv"
     with pushd(data_dir / "csv"):
-        if con.name == "pyspark":
-            # pyspark doesn't respect CWD
+        if con.name in (
+            "pyspark",
+            "sqlite",
+            "mysql",
+            "postgres",
+            "risingwave",
+            "mssql",
+        ):
+            # pyspark backend doesn't respect CWD
+            # backends using pyarrow implementation need absolute path
             fname = str(Path(fname).absolute())
         table = con.read_csv(fname, table_name=in_table_name)