From 5723adad7ad80c95ba8fcb55d40186d6a29edb74 Mon Sep 17 00:00:00 2001 From: Jeffrey Wong Date: Sat, 5 Jan 2019 12:33:09 -0600 Subject: [PATCH] ARROW-3731: MVP to read parquet in R library I am contributing to [Arrow 3731](https://issues.apache.org/jira/browse/ARROW-3731). This PR has the minimum functionality to read parquet files into an arrow::Table, which can then be converted to a tibble. Multiple parquet files can be read inside `lapply`, and then concatenated at the end. Steps to compile 1) Build arrow and parquet c++ projects 2) In R run `devtools::load_all()` What I could use help with: The biggest challenge for me is my lack of experience with pkg-config. The R library has a `configure` file which uses pkg-config to figure out what c++ libraries to link to. Currently, `configure` looks up the Arrow project and links to -larrow only. We need it to also link to -lparquet. I do not know how to modify pkg-config's metadata to let it know to link to both -larrow and -lparquet Author: Jeffrey Wong Author: Romain Francois Author: jeffwong-nflx Closes #3230 from jeffwong-nflx/master and squashes the following commits: c67fa3d36 Merge pull request #3 from jeffwong-nflx/cleanup 1df3026cb don't hard code -larrow and -lparquet 8ccaa5172 cleanup 75ba5c9ae add contributor 56adad2ae Merge pull request #2 from romainfrancois/3731/parquet-2 7d6e64df2 read_parquet() only reading one parquet file, and gains a `as_tibble` argument e936b4400 need parquet on travis too ff260c587 header was too commented, renamed to parquet.cpp 9e1897f80 styling etc ... 456c5d260 read parquet files 22d89dd23 hardcode -larrow and -lparquet --- .travis.yml | 2 ++ r/DESCRIPTION | 2 ++ r/NAMESPACE | 1 + r/R/RcppExports.R | 4 +++ r/R/parquet.R | 33 +++++++++++++++++++++++ r/README.Rmd | 2 +- r/README.md | 61 ++++++++++++------------------------------- r/configure | 4 +-- r/man/read_parquet.Rd | 21 +++++++++++++++ r/src/RcppExports.cpp | 12 +++++++++ r/src/parquet.cpp | 37 ++++++++++++++++++++++++++ 11 files changed, 131 insertions(+), 48 deletions(-) create mode 100644 r/R/parquet.R create mode 100644 r/man/read_parquet.Rd create mode 100644 r/src/parquet.cpp diff --git a/.travis.yml b/.travis.yml index f14f7e4785948..916ccf460ecf8 100644 --- a/.travis.yml +++ b/.travis.yml @@ -326,6 +326,8 @@ matrix: language: r cache: packages latex: false + env: + - ARROW_TRAVIS_PARQUET=1 before_install: # Have to copy-paste this here because of how R's build steps work - eval `python $TRAVIS_BUILD_DIR/ci/detect-changes.py` diff --git a/r/DESCRIPTION b/r/DESCRIPTION index a2632973134b9..5303a877f9e26 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -4,6 +4,7 @@ Version: 0.11.0.9000 Authors@R: c( person("Romain", "François", email = "romain@rstudio.com", role = c("aut", "cre")), person("Javier", "Luraschi", email = "javier@rstudio.com", role = c("ctb")), + person("Jeffrey", "Wong", email = "jeffreyw@netflix.com", role = c("ctb")), person("Apache Arrow", email = "dev@arrow.apache.org", role = c("aut", "cph")) ) Description: R Integration to 'Apache' 'Arrow'. @@ -62,6 +63,7 @@ Collate: 'memory_pool.R' 'message.R' 'on_exit.R' + 'parquet.R' 'read_record_batch.R' 'read_table.R' 'reexports-bit64.R' diff --git a/r/NAMESPACE b/r/NAMESPACE index 8846defbd8e65..f8f6384dce1f8 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -123,6 +123,7 @@ export(read_arrow) export(read_csv_arrow) export(read_feather) export(read_message) +export(read_parquet) export(read_record_batch) export(read_schema) export(read_table) diff --git a/r/R/RcppExports.R b/r/R/RcppExports.R index 55b9ab33ebf98..c6fe8719f4e89 100644 --- a/r/R/RcppExports.R +++ b/r/R/RcppExports.R @@ -637,6 +637,10 @@ ipc___ReadMessage <- function(stream) { .Call(`_arrow_ipc___ReadMessage`, stream) } +read_parquet_file <- function(filename) { + .Call(`_arrow_read_parquet_file`, filename) +} + RecordBatch__num_columns <- function(x) { .Call(`_arrow_RecordBatch__num_columns`, x) } diff --git a/r/R/parquet.R b/r/R/parquet.R new file mode 100644 index 0000000000000..141da7bd04b2c --- /dev/null +++ b/r/R/parquet.R @@ -0,0 +1,33 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#' Read parquet file from disk +#' +#' @param file a file path +#' @param as_tibble should the [arrow::Table][arrow__Table] be converted to a tibble. +#' @param ... currently ignored +#' +#' @return a [arrow::Table][arrow__Table], or a data frame if `as_tibble` is `TRUE`. +#' +#' @export +read_parquet <- function(file, as_tibble = TRUE, ...) { + tab <- shared_ptr(`arrow::Table`, read_parquet_file(f)) + if (isTRUE(as_tibble)) { + tab <- as_tibble(tab) + } + tab +} diff --git a/r/README.Rmd b/r/README.Rmd index 2c51d01c0f00f..9f0f39fef5352 100644 --- a/r/README.Rmd +++ b/r/README.Rmd @@ -25,7 +25,7 @@ git clone https://github.com/apache/arrow.git cd arrow/cpp && mkdir release && cd release # It is important to statically link to boost libraries -cmake .. -DCMAKE_BUILD_TYPE=Release -DARROW_BOOST_USE_SHARED:BOOL=Off +cmake .. -DARROW_PARQUET=ON -DCMAKE_BUILD_TYPE=Release -DARROW_BOOST_USE_SHARED:BOOL=Off make install ``` diff --git a/r/README.md b/r/README.md index 868fdff0a06e0..987d0c24a185b 100644 --- a/r/README.md +++ b/r/README.md @@ -14,7 +14,7 @@ git clone https://github.com/apache/arrow.git cd arrow/cpp && mkdir release && cd release # It is important to statically link to boost libraries -cmake .. -DCMAKE_BUILD_TYPE=Release -DARROW_BOOST_USE_SHARED:BOOL=Off +cmake .. -DARROW_PARQUET=ON -DCMAKE_BUILD_TYPE=Release -DARROW_BOOST_USE_SHARED:BOOL=Off make install ``` @@ -38,48 +38,19 @@ tf <- tempfile() #> # A tibble: 10 x 2 #> x y #> -#> 1 1 -0.255 -#> 2 2 -0.162 -#> 3 3 -0.614 -#> 4 4 -0.322 -#> 5 5 0.0693 -#> 6 6 -0.920 -#> 7 7 -1.08 -#> 8 8 0.658 -#> 9 9 0.821 -#> 10 10 0.539 -arrow::write_arrow(tib, tf) - -# read it back with pyarrow -pa <- import("pyarrow") -as_tibble(pa$open_file(tf)$read_pandas()) -#> # A tibble: 10 x 2 -#> x y -#> -#> 1 1 -0.255 -#> 2 2 -0.162 -#> 3 3 -0.614 -#> 4 4 -0.322 -#> 5 5 0.0693 -#> 6 6 -0.920 -#> 7 7 -1.08 -#> 8 8 0.658 -#> 9 9 0.821 -#> 10 10 0.539 -``` - -## Development - -### Code style - -We use Google C++ style in our C++ code. Check for style errors with - -``` -./lint.sh -``` - -You can fix the style issues with - +#> 1 1 0.0855 +#> 2 2 -1.68 +#> 3 3 -0.0294 +#> 4 4 -0.124 +#> 5 5 0.0675 +#> 6 6 1.64 +#> 7 7 1.54 +#> 8 8 -0.0209 +#> 9 9 -0.982 +#> 10 10 0.349 +# arrow::write_arrow(tib, tf) + +# # read it back with pyarrow +# pa <- import("pyarrow") +# as_tibble(pa$open_file(tf)$read_pandas()) ``` -./lint.sh --fix -``` \ No newline at end of file diff --git a/r/configure b/r/configure index 28f6a73ac7ef5..c17fd4c2ef624 100755 --- a/r/configure +++ b/r/configure @@ -26,13 +26,13 @@ # R CMD INSTALL --configure-vars='INCLUDE_DIR=/.../include LIB_DIR=/.../lib' # Library settings -PKG_CONFIG_NAME="arrow" +PKG_CONFIG_NAME="arrow parquet" PKG_DEB_NAME="arrow" PKG_RPM_NAME="arrow" PKG_CSW_NAME="arrow" PKG_BREW_NAME="apache-arrow" PKG_TEST_HEADER="" -PKG_LIBS="-larrow" +PKG_LIBS="-larrow -lparquet" # Use pkg-config if available pkg-config --version >/dev/null 2>&1 diff --git a/r/man/read_parquet.Rd b/r/man/read_parquet.Rd new file mode 100644 index 0000000000000..c29e18bca5baf --- /dev/null +++ b/r/man/read_parquet.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/parquet.R +\name{read_parquet} +\alias{read_parquet} +\title{Read parquet file from disk} +\usage{ +read_parquet(file, as_tibble = TRUE, ...) +} +\arguments{ +\item{file}{a file path} + +\item{as_tibble}{should the \link[=arrow__Table]{arrow::Table} be converted to a tibble.} + +\item{...}{currently ignored} +} +\value{ +a \link[=arrow__Table]{arrow::Table}, or a data frame if \code{as_tibble} is \code{TRUE}. +} +\description{ +Read parquet file from disk +} diff --git a/r/src/RcppExports.cpp b/r/src/RcppExports.cpp index c752afba1c258..1e8fed1867655 100644 --- a/r/src/RcppExports.cpp +++ b/r/src/RcppExports.cpp @@ -1779,6 +1779,17 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// read_parquet_file +std::shared_ptr read_parquet_file(std::string filename); +RcppExport SEXP _arrow_read_parquet_file(SEXP filenameSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< std::string >::type filename(filenameSEXP); + rcpp_result_gen = Rcpp::wrap(read_parquet_file(filename)); + return rcpp_result_gen; +END_RCPP +} // RecordBatch__num_columns int RecordBatch__num_columns(const std::shared_ptr& x); RcppExport SEXP _arrow_RecordBatch__num_columns(SEXP xSEXP) { @@ -2369,6 +2380,7 @@ static const R_CallMethodDef CallEntries[] = { {"_arrow_ipc___MessageReader__Open", (DL_FUNC) &_arrow_ipc___MessageReader__Open, 1}, {"_arrow_ipc___MessageReader__ReadNextMessage", (DL_FUNC) &_arrow_ipc___MessageReader__ReadNextMessage, 1}, {"_arrow_ipc___ReadMessage", (DL_FUNC) &_arrow_ipc___ReadMessage, 1}, + {"_arrow_read_parquet_file", (DL_FUNC) &_arrow_read_parquet_file, 1}, {"_arrow_RecordBatch__num_columns", (DL_FUNC) &_arrow_RecordBatch__num_columns, 1}, {"_arrow_RecordBatch__num_rows", (DL_FUNC) &_arrow_RecordBatch__num_rows, 1}, {"_arrow_RecordBatch__schema", (DL_FUNC) &_arrow_RecordBatch__schema, 1}, diff --git a/r/src/parquet.cpp b/r/src/parquet.cpp new file mode 100644 index 0000000000000..859bd4826e7c2 --- /dev/null +++ b/r/src/parquet.cpp @@ -0,0 +1,37 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include +#include + +// [[Rcpp::export]] +std::shared_ptr read_parquet_file(std::string filename) { + std::shared_ptr infile; + PARQUET_THROW_NOT_OK( + arrow::io::ReadableFile::Open(filename, arrow::default_memory_pool(), &infile)); + + std::unique_ptr reader; + PARQUET_THROW_NOT_OK( + parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader)); + std::shared_ptr table; + PARQUET_THROW_NOT_OK(reader->ReadTable(&table)); + + return table; +}