Merge branch 'branch-0.15'
William Malpica committed Aug 31, 2020
2 parents a6334f4 + 45049fe commit 1932065
Showing 337 changed files with 29,291 additions and 37,759 deletions.
22 changes: 22 additions & 0 deletions .gitignore
@@ -69,3 +69,25 @@ configurationFile.json
*logs*

# END gitignore from e2e repo

thirdparty/aws-cpp
thirdparty/rapids/
thirdparty/cudf/

.condarc
.conda/
core

# stuff from our docker utils
.cache/
.config/
.cupy/
.jitify-cache/
.nv/

# powerpc
powerpc/tmp/
powerpc/blazingsql.tar.gz
powerpc/developer/requirements.txt
powerpc/developer/core
powerpc/developer/blazingsql.tar.gz
137 changes: 131 additions & 6 deletions CHANGELOG.md
@@ -2,9 +2,134 @@
# Use this file to document any changes made during a PR. Every PR #
# should have an entry. #
#####################################################################
<<<<<<< HEAD:CHANGELOG.md
#322 Added the ability to run count distinct queries in a distributed fashion
=======
#391 Added the ability to run count distinct queries in a distributed fashion
#392 Remove the unnecessary messages on distributed mode
>>>>>>> branch-0.13:changelog


# BlazingSQL 0.15.0 (August 31, 2020)

## New Features
- #835 Added a memory monitor for better memory management and added ordered pull from cache
- #889 Added Sphinx based code architecture documentation
- #968 Support PowerPC architecture

## Improvements
- #777 Update Calcite to the most recent version 1.23
- #786 Added check for concat String overflow
- #815 Implemented Unordered pull from cache to help performance
- #822 remove "from_cudf" code and cudf test utilities from engine code
- #824 Added a test on Calcite to compare the logical plans when the ruleset is updated
- #802 Support for timestampadd and constant expressions evaluation by Calcite
- #849 Added check for CUDF_HOME to allow build to use an existing prebuilt cudf source tree
- #829 Python/Cython check code style
- #826 Support cross join
- #866 Added nogil statements for pure C functions in Cython
- #784 Updated set of TPCH queries on the E2E tests
- #877 round robin dask workers on single gpu queries
- #880 reraising query errors in context.py
- #883 add rand() and running unary operations on literals
- #894 added exhale to generate doxygen for sphinx docs
- #887 concatenating cache improvement and replacing PartwiseJoin::load_set with a concatenating cache
- #885 Added initial set of unit tests for `WaitingQueue` and nullptr checks around spdlog calls
- #904 Added doxygen comments to CacheMachine.h
- #901 Added more documentation about memory management
- #910 updated readme
- #915 Adding max kernel num threads pool
- #921 Make AWS and GCS optional
- #925 Replace random_generator with cudf::sample
- #900 Added doxygen comments to some kernels and the batch processing
- #936 Adding extern C for include files
- #941 Logging level (flush_on) can be configurable
- #947 Use default client and network interface from Dask
- #945 Added new separate threshold for concat cache
- #939 Add unit test for Project kernel
- #949 Implemented use of a threadpool for outgoing messages
- #961 Add list_tables() and describe_table() functions
- #967 Add bc.get_free_memory() function (see the usage sketch after this list)
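
A minimal usage sketch of the helpers named in #961 and #967; the file path and the return shapes described in the comments are illustrative assumptions, not documented guarantees:

```python
from blazingsql import BlazingContext

bc = BlazingContext()
bc.create_table('taxi', '/path/to/taxi.parquet')  # hypothetical path

print(bc.list_tables())           # table names registered on this context
print(bc.describe_table('taxi'))  # column names and types for one table
print(bc.get_free_memory())       # free GPU memory available to the engine
```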

## Bug Fixes
- #774 fixed build issues with latest cudf 0.15 including updating from_cudf
- #781 Fixed issue with Hive partitions when doing SELECT *
- #754 Normalize columns before distribution in JoinPartitionKernel
- #782 fixed issue with hive partitions base folder
- #791 Fixes issues due to changes in rmm and fixes allocator issues
- #770 Fix interops operators output types
- #798 Fix when the algebra plan was provided using one-line as logical plan
- #799 Fix URI values computation in runQueryCaller
- #792 Remove orc temp files when cached on Disk
- #814 Fix when checking only Limit and Scan Kernels
- #816 Loading one file at a time (LimitKernel and ScanKernel)
- #832 updated calcite test reference
- #834 Fixed small issue with hive and cudf_type_int_to_np_types
- #839 Fixes literal cast
- #838 Fixed issue with start and length of substring being different types
- #823 Fixed issue on logical plans when there is an EXISTS clause
- #845 Fixed issue with casting string to string
- #850 Fixed issue with getTableScanInfoCaller
- #851 Fix row_groups issue in ParquetParser.cpp
- #847 Fixed issue with some constant expressions not evaluated by Calcite
- #875 Recovered some old unit tests and deleted obsolete unit tests
- #879 Fixed issue with log directory creation in a distributed environment
- #890 Fixed issue where we were including testing hpp in our code
- #891 Fixed issue caused by replacing join load_set with concatenating cache
- #902 Fixed optimization regression on the select count(*) case
- #909 Fixed issue caused by now using arrow_io_source
- #913 Fixed issues caused by cudf adding DECIMAL data type
- #916 Fix e2e string comparison
- #927 Fixed random segfault issue in parser
- #929 Update the GPUManager functions
- #942 Fix column names on sample function
- #950 Introducing config param for max orderby samples and fixing oversampling
- #952 Dummy PR
- #957 Fixed issues caused by changes to timestamp in cudf
- #962 Use new rmm API instead of get_device_resource() and set_device_resource() functions
- #965 Handle exceptions from pool_threads
- #963 Set log_level when using the LOGGING_LEVEL param (see the sketch after this list)
- #973 Fix how we check the existence of the JAVA_HOME environment variable
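
A hedged sketch of how the logging parameters from #941 and #963 are supplied, assuming config_options can be passed when creating the context (the option value shown is illustrative):

```python
from blazingsql import BlazingContext

# LOGGING_LEVEL sets the engine's log level (#963); log flushing
# behaviour is configurable as well (#941).
bc = BlazingContext(config_options={'LOGGING_LEVEL': 'debug'})
```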

# BlazingSQL 0.14.0 (June 9, 2020)

- #391 Added the ability to run count distinct queries in a distributed fashion
- #392 Remove the unnecessary messages on distributed mode
- #560 Fixed bug where parsing errors would lead to crash
- #565 Made csv reading behaviour match cudf's
- #612 Print product version: print(blazingsql.__version__) # shows the git hash
- #638 Refactors and fixes SortAndSample kernels
- #631 Implemented ability to send config_options to bc.sql function
- #621 Clean dead code
- #602 Implements cache flow control feature
- #625 Implement CAST to TINYINT and SMALLINT
- #632 Implement CHAR_LENGTH function
- #635 Handle behavior when the optimized plan contains a LogicalValues
- #653 Handle exceptions on python side
- #661 added hive support to parse_batch
- #662 updated from_cudf code and fixed other issue due to new cudf::list_view
- #674 Allow to define and use a specific AWS S3 region
- #677 added guava to pom.xml
- #679 Support modern compilers (>= g++-7.x)
- #649 Adding event logging
- #660 Changed how we handle the partitions of a dask.cudf.DataFrame
- #697 Update expression parser
- #659 Improve reading for: SELECT * FROM table LIMIT N
- #700 Support null column in projection
- #711 Migrate end to end tests into blazingsql repo
- #718 Changed all condition variable waits to wait_for
- #712 Fixed how we handle empty tables when estimating for small table joins
- #724 Removed unused BlazingThread creations
- #725 Added nullptr check to num_rows()
- #729 Fixed issue with num_rows() and wait_for
- #728 Add replace_calcite_regex function to the join condition
- #721 Handling multi-partition output
- #750 Each table scan now has its own data loader
- #740 Normalizing types for UNION ALL
- #744 Fix unit tests
- #743 Workaround for interops 64 index plan limitation
- #763 Implemented ability to set the folder for all log files
- #757 Ensure GPU portability (so we can run on any cloud instance with GPU)
- #753 Fix for host memory threshold parameter with Dask envs
- #801 Fix build with new cudf 0.15 and arrow 0.17.1
- #809 Fix conda build issues
- #828 Fix gpuci issues and improve tooling to debug gpuci related issues
- #867 Fix boost dependency issues
- #785 Add script for Manual Testing Artifacts.
- #931 Add script for error messages validation.
- #932 Import pydrill and pyspark only when in generator or full mode.

88 changes: 64 additions & 24 deletions README.md
@@ -70,29 +70,42 @@ bc.sql('SELECT passenger_count, trip_distance FROM taxi LIMIT 2')
## Documentation
You can find our full documentation at [docs.blazingdb.com](https://docs.blazingdb.com/docs).

# Prerequisites
* [Anaconda or Miniconda](https://docs.conda.io/projects/conda/en/latest/user-guide/install/linux.html) installed
* OS Support
* Ubuntu 16.04/18.04 LTS
* CentOS 7
* GPU Support
* Pascal or Better
* Compute Capability >= 6.0
* CUDA Support
* 10.1.2
* 10.2
* Python Support
* 3.7
* 3.8

# Install Using Conda
BlazingSQL can be installed with conda ([miniconda](https://conda.io/miniconda.html), or the full [Anaconda distribution](https://www.anaconda.com/download)) from the [blazingsql](https://anaconda.org/blazingsql/) channel:

Note: BlazingSQL is supported only on Linux, with Python versions 3.7 and 3.8.

## Stable Version
```bash
conda install -c blazingsql/label/cuda$CUDA_VERSION -c blazingsql -c rapidsai -c nvidia -c conda-forge -c defaults blazingsql python=$PYTHON_VERSION
conda install -c blazingsql/label/cuda$CUDA_VERSION -c rapidsai -c nvidia -c conda-forge -c defaults blazingsql python=$PYTHON_VERSION
```
Where $CUDA_VERSION is 10.0, 10.1 or 10.2 and $PYTHON_VERSION is 3.6 or 3.7
*For example for CUDA 10.0 and Python 3.7:*
Where $CUDA_VERSION is 10.1, 10.2 or 11.0 and $PYTHON_VERSION is 3.7 or 3.8
*For example for CUDA 10.1 and Python 3.7:*
```bash
conda install -c blazingsql/label/cuda10.0 -c blazingsql -c rapidsai -c nvidia -c conda-forge -c defaults blazingsql python=3.7
conda install -c blazingsql/label/cuda10.1 -c rapidsai -c nvidia -c conda-forge -c defaults blazingsql python=3.7
```
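
To confirm the install, print the package version; per changelog entry #612, `blazingsql.__version__` includes the git hash:

```python
import blazingsql

print(blazingsql.__version__)  # shows the git hash, per #612
```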

## Nightly Version
```bash
conda install -c blazingsql-nightly/label/cuda$CUDA_VERSION -c blazingsql-nightly -c rapidsai-nightly -c nvidia -c conda-forge -c defaults blazingsql python=$PYTHON_VERSION
conda install -c blazingsql-nightly/label/cuda$CUDA_VERSION -c rapidsai-nightly -c nvidia -c conda-forge -c defaults blazingsql python=$PYTHON_VERSION
```
Where $CUDA_VERSION is 10.0, 10.1 or 10.2 and $PYTHON_VERSION is 3.6 or 3.7
*For example for CUDA 10.0 and Python 3.7:*
Where $CUDA_VERSION is 10.1, 10.2 or 11.0 and $PYTHON_VERSION is 3.7 or 3.8
*For example for CUDA 10.1 and Python 3.7:*
```bash
conda install -c blazingsql-nightly/label/cuda10.0 -c blazingsql-nightly -c rapidsai-nightly -c nvidia -c conda-forge -c defaults blazingsql python=3.7
conda install -c blazingsql-nightly/label/cuda10.1 -c rapidsai-nightly -c nvidia -c conda-forge -c defaults blazingsql python=3.7
```

# Build/Install from Source (Conda Environment)
@@ -105,17 +118,15 @@ This is the recommended way of building all of the BlazingSQL components and dependencies
conda create -n bsql python=$PYTHON_VERSION
conda activate bsql
conda install --yes -c conda-forge openjdk=8.0 maven cmake gtest gmock rapidjson cppzmq cython=0.29 jpype1 netifaces pyhive
conda install --yes -c conda-forge -c blazingsql bsql-toolchain
conda install --yes -c rapidsai -c nvidia -c conda-forge -c defaults cudf=0.14 dask-cudf=0.14 dask-cuda=0.14 cudatoolkit=$CUDA_VERSION
conda install --yes -c rapidsai -c nvidia -c conda-forge -c defaults cudf=0.15 dask-cudf=0.15 dask-cuda=0.15 cudatoolkit=$CUDA_VERSION
```
Where $CUDA_VERSION is 10.0, 10.1 or 10.2 and $PYTHON_VERSION is 3.6 or 3.7
*For example for CUDA 10.0 and Python 3.7:*
Where $CUDA_VERSION is 10.1, 10.2 or 11.0 and $PYTHON_VERSION is 3.7 or 3.8
*For example for CUDA 10.1 and Python 3.7:*
```bash
conda create -n bsql python=3.7
conda activate bsql
conda install --yes -c conda-forge openjdk=8.0 maven cmake gtest gmock rapidjson cppzmq cython=0.29 jpype1 netifaces pyhive
conda install --yes -c conda-forge -c blazingsql bsql-toolchain
conda install --yes -c rapidsai -c nvidia -c conda-forge -c defaults cudf=0.14 dask-cudf=0.14 dask-cuda=0.14 cudatoolkit=10.0
conda install --yes -c rapidsai -c nvidia -c conda-forge -c defaults cudf=0.15 dask-cudf=0.15 dask-cuda=0.15 cudatoolkit=10.1
```

### Build
@@ -139,18 +150,19 @@ $CONDA_PREFIX now has a folder for the blazingsql repository.
```bash
conda create -n bsql python=$PYTHON_VERSION
conda activate bsql
conda install --yes -c conda-forge openjdk=8.0 maven cmake gtest gmock rapidjson cppzmq cython=0.29 jpype1 netifaces pyhive
conda install --yes -c conda-forge -c blazingsql-nightly bsql-toolchain=0.15
conda install --yes -c rapidsai-nightly -c nvidia -c conda-forge -c defaults libcudf=0.15 cudf=0.15 dask-cudf=0.15 dask-cuda=0.15 cudatoolkit=$CUDA_VERSION

conda install --yes -c conda-forge google-cloud-cpp ninja
conda install --yes -c rapidsai-nightly -c nvidia -c conda-forge -c defaults dask-cuda=0.16 dask-cudf=0.16 cudf=0.16 python=3.7 cudatoolkit=$CUDA_VERSION
conda install --yes -c conda-forge cmake gtest gmock cppzmq cython=0.29 openjdk=8.0 maven thrift=0.13.0 jpype1 netifaces pyhive
```
Where $CUDA_VERSION is 10.0, 10.1 or 10.2 and $PYTHON_VERSION is 3.6 or 3.7
*For example for CUDA 10.0 and Python 3.7:*
Where $CUDA_VERSION is 10.1, 10.2 or 11.0 and $PYTHON_VERSION is 3.7 or 3.8
*For example for CUDA 10.1 and Python 3.7:*
```bash
conda create -n bsql python=3.7
conda activate bsql
conda install --yes -c conda-forge openjdk=8.0 maven cmake gtest gmock rapidjson cppzmq cython=0.29 jpype1 netifaces pyhive
conda install --yes -c conda-forge -c blazingsql-nightly bsql-toolchain=0.15
conda install --yes -c rapidsai-nightly -c nvidia -c conda-forge -c defaults libcudf=0.15 cudf=0.15 dask-cudf=0.15 dask-cuda=0.15 cudatoolkit=10.0
conda install --yes -c conda-forge google-cloud-cpp ninja
conda install --yes -c rapidsai-nightly -c nvidia -c conda-forge -c defaults dask-cuda=0.16 dask-cudf=0.16 cudf=0.16 python=3.7 cudatoolkit=10.1
conda install --yes -c conda-forge cmake gtest gmock cppzmq cython=0.29 openjdk=8.0 maven thrift=0.13.0 jpype1 netifaces pyhive
```

### Build
@@ -167,6 +179,34 @@ NOTE: You can do `./build.sh -h` to see more build options.

$CONDA_PREFIX now has a folder for the blazingsql repository.

#### Storage plugins
To build without the storage plugins (AWS S3, Google Cloud Storage) use the following arguments:
```bash
# Disable all storage plugins
./build.sh disable-aws-s3 disable-google-gs

# Disable AWS S3 storage plugin
./build.sh disable-aws-s3

# Disable Google Cloud Storage plugin
./build.sh disable-google-gs
```
NOTE: If you disable the storage plugins, you do not need to install the AWS SDK for C++ or the Google Cloud Storage client beforehand (nor any of their dependencies).
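
When the plugins are built, they back BlazingContext's filesystem registration API. A minimal S3 sketch; the registration name, bucket, and credential parameter names are illustrative assumptions, not a definitive signature:

```python
from blazingsql import BlazingContext

bc = BlazingContext()
# Register an S3 bucket under a name, then create tables from s3:// paths
bc.s3('my_data', bucket_name='my-bucket',
      access_key_id='<ACCESS_KEY>', secret_key='<SECRET_KEY>')
bc.create_table('orders', 's3://my_data/orders/*.parquet')
```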

# Documentation
User guides and public API documentation can be found [here](https://docs.blazingdb.com/docs).

Documentation for our internal code architecture can be built using Sphinx.
```bash
pip install recommonmark exhale
conda install -c conda-forge doxygen
cd $CONDA_PREFIX
cd blazingsql/docs
make html
```
The generated documentation can be viewed in a browser at `blazingsql/docs/_build/html/index.html`


# Community
## Contributing
Have questions or feedback? Post a [new github issue](https://github.com/blazingdb/blazingsql/issues/new/choose).
7 changes: 7 additions & 0 deletions algebra/blazingdb-calcite-application/pom.xml
@@ -71,6 +71,13 @@
<artifactId>commons-cli</artifactId>
<version>1.4</version>
</dependency>

<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-dbcp2</artifactId>
<version>2.6.0</version>
</dependency>

</dependencies>

<build>
RelationalAlgebraGenerator.java
Expand Up @@ -28,6 +28,8 @@
import org.apache.calcite.rel.rules.ProjectMergeRule;
import org.apache.calcite.rel.rules.ProjectRemoveRule;
import org.apache.calcite.rel.rules.AggregateReduceFunctionsRule;
import org.apache.calcite.rel.rules.ReduceExpressionsRule;
import org.apache.calcite.rex.RexExecutorImpl;
import org.apache.calcite.rel.type.RelDataTypeSystem;
import org.apache.calcite.schema.SchemaPlus;
import org.apache.calcite.sql.SqlNode;
@@ -182,11 +184,16 @@ public RelationalAlgebraGenerator(FrameworkConfig frameworkConfig, HepProgram he
.addRuleInstance(FilterAggregateTransposeRule.INSTANCE)
.addRuleInstance(FilterJoinRule.JoinConditionPushRule.FILTER_ON_JOIN)
.addRuleInstance(FilterJoinRule.JoinConditionPushRule.JOIN)
.addRuleInstance(ProjectMergeRule.INSTANCE)
.addRuleInstance(FilterMergeRule.INSTANCE)
.addRuleInstance(ProjectJoinTransposeRule.INSTANCE)
.addRuleInstance(ProjectFilterTransposeRule.INSTANCE)
.addRuleInstance(ProjectMergeRule.INSTANCE)
.addRuleInstance(ProjectRemoveRule.INSTANCE)

//The following rules evaluate constant expressions in Projects and Filters,
//e.g. folding an expression like +(1, 2) into the literal 3
.addRuleInstance(ReduceExpressionsRule.PROJECT_INSTANCE)
.addRuleInstance(ReduceExpressionsRule.FILTER_INSTANCE)

.addRuleInstance(ProjectTableScanRule.INSTANCE)
.addRuleInstance(FilterTableScanRule.INSTANCE)
.addRuleInstance(FilterRemoveIsNotDistinctFromRule.INSTANCE)
Expand All @@ -201,6 +208,7 @@ public RelationalAlgebraGenerator(FrameworkConfig frameworkConfig, HepProgram he
}

final HepPlanner hepPlanner = new HepPlanner(program, config.getContext());
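// Register an executor so the ReduceExpressionsRule instances added above
// can actually evaluate constant expressions during planning.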
nonOptimizedPlan.getCluster().getPlanner().setExecutor(new RexExecutorImpl(null));
hepPlanner.setRoot(nonOptimizedPlan);

planner.close();
@@ -240,11 +248,13 @@ public RelationalAlgebraGenerator(FrameworkConfig frameworkConfig, HepProgram he
}catch(SqlValidationException ex){
//System.out.println(ex.getMessage());
//System.out.println("Found validation err!");
return "fail: \n " + ex.getMessage();
throw ex;
//return "fail: \n " + ex.getMessage();
}catch(SqlSyntaxException ex){
//System.out.println(ex.getMessage());
//System.out.println("Found syntax err!");
return "fail: \n " + ex.getMessage();
throw ex;
//return "fail: \n " + ex.getMessage();
} catch(Exception ex) {
//System.out.println(ex.toString());
//System.out.println(ex.getMessage());
SqlValidationException.java
@@ -17,8 +17,7 @@ public SqlValidationException(final String queryString, final ValidationException

private static String
description(final String queryString, final String message) {

return message;

// Strip Calcite's position prefix (e.g. "From line 1, column 8 to line 1,
// column 19: ...") by splitting on the trailing "<number>: " and keeping
// only the human-readable part of the message.
String[] a = message.split("(\\d+): ");
return a[a.length - 1]; //TODO William, Felipe, Rommel, Percy use a better regular expression / approach
}
}
