Merge branch 'branch-0.15'
William Malpica committed Aug 31, 2020
2 parents a6334f4 + 45049fe commit 1932065
Showing 337 changed files with 29,291 additions and 37,759 deletions.
22 changes: 22 additions & 0 deletions .gitignore
@@ -69,3 +69,25 @@ configurationFile.json
*logs*

# END gitignore from e2e repo

thirdparty/aws-cpp
thirdparty/rapids/
thirdparty/cudf/

.condarc
.conda/
core

# stuff from our docker utils
.cache/
.config/
.cupy/
.jitify-cache/
.nv/

# powerpc
powerpc/tmp/
powerpc/blazingsql.tar.gz
powerpc/developer/requirements.txt
powerpc/developer/core
powerpc/developer/blazingsql.tar.gz
137 changes: 131 additions & 6 deletions CHANGELOG.md
@@ -2,9 +2,134 @@
# Use this file to document any changes made during a PR. Every PR #
# should have an entry. #
#####################################################################
<<<<<<< HEAD:CHANGELOG.md
#322 Added the ability to run count distinct queries in a distributed fashion
=======
#391 Added the ability to run count distinct queries in a distributed fashion
#392 Remove the unnecessary messages on distributed mode
>>>>>>> branch-0.13:changelog


# BlazingSQL 0.15.0 (August 31, 2020)

## New Features
- #835 Added a memory monitor for better memory management and added ordered pull from cache
- #889 Added Sphinx based code architecture documentation
- #968 Support PowerPC architecture

## Improvements
- #777 Update Calcite to the most recent version 1.23
- #786 Added check for concat String overflow
- #815 Implemented Unordered pull from cache to help performance
- #822 remove "from_cudf" code and cudf test utilities from engine code
- #824 Added a test on Calcite to compare the logical plans when the ruleset is updated
- #802 Support for timestampadd and constant expressions evaluation by Calcite
- #849 Added check for CUDF_HOME to allow build to use an existing prebuilt cudf source tree
- #829 Python/Cython check code style
- #826 Support cross join
- #866 Added nogil statements for pure C functions in Cython
- #784 Updated set of TPCH queries on the E2E tests
- #877 round robin dask workers on single gpu queries
- #880 reraising query errors in context.py
- #883 add rand() and running unary operations on literals
- #894 added exhale to generate doxygen for sphinx docs
- #887 concatenating cache improvement and replacing PartwiseJoin::load_set with a concatenating cache
- #885 Added initial set of unit tests for `WaitingQueue` and nullptr checks around spdlog calls
- #904 Added doxygen comments to CacheMachine.h
- #901 Added more documentation about memory management
- #910 updated readme
- #915 Adding max kernel num threads pool
- #921 Make AWS and GCS optional
- #925 Replace random_generator with cudf::sample
- #900 Added doxygen comments to some kernels and the batch processing
- #936 Adding extern C for include files
- #941 Logging level (flush_on) can be configurable
- #947 Use default client and network interface from Dask
- #945 Added new separate threshold for concat cache
- #939 Add unit test for Project kernel
- #949 Implemented use of a threadpool for outgoing messages
- #961 Add list_tables() and describe_table() functions
- #967 Add bc.get_free_memory() function (see the usage sketch after this list)
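
A minimal usage sketch of the helpers named in #961 and #967; the file path and the return shapes described in the comments are illustrative assumptions, not documented guarantees:

```python
from blazingsql import BlazingContext

bc = BlazingContext()
bc.create_table('taxi', '/path/to/taxi.parquet')  # hypothetical path

print(bc.list_tables())           # table names registered on this context
print(bc.describe_table('taxi'))  # column names and types for one table
print(bc.get_free_memory())       # free GPU memory available to the engine
```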

## Bug Fixes
- #774 fixed build issues with latest cudf 0.15 including updating from_cudf
- #781 Fixed issue with Hive partitions when doing SELECT *
- #754 Normalize columns before distribution in JoinPartitionKernel
- #782 fixed issue with hive partitions base folder
- #791 Fixes issues due to changes in rmm and fixes allocator issues
- #770 Fix interops operators output types
- #798 Fix when the algebra plan was provided using one-line as logical plan
- #799 Fix URI values computation in runQueryCaller
- #792 Remove orc temp files when cached on Disk
- #814 Fix when checking only Limit and Scan Kernels
- #816 Loading one file at a time (LimitKernel and ScanKernel)
- #832 updated calcite test reference
- #834 Fixed small issue with hive and cudf_type_int_to_np_types
- #839 Fixes literal cast
- #838 Fixed issue with start and length of substring being different types
- #823 Fixed issue on logical plans when there is an EXISTS clause
- #845 Fixed issue with casting string to string
- #850 Fixed issue with getTableScanInfoCaller
- #851 Fix row_groups issue in ParquetParser.cpp
- #847 Fixed issue with some constant expressions not evaluated by Calcite
- #875 Recovered some old unit tests and deleted obsolete unit tests
- #879 Fixed issue with log directory creation in a distributed environment
- #890 Fixed issue where we were including testing hpp in our code
- #891 Fixed issue caused by replacing join load_set with concatenating cache
- #902 Fixed optimization regression on the select count(*) case
- #909 Fixed issue caused by now using arrow_io_source
- #913 Fixed issues caused by cudf adding DECIMAL data type
- #916 Fix e2e string comparison
- #927 Fixed random segfault issue in parser
- #929 Update the GPUManager functions
- #942 Fix column names on sample function
- #950 Introducing config param for max orderby samples and fixing oversampling
- #952 Dummy PR
- #957 Fixed issues caused by changes to timestamp in cudf
- #962 Use new rmm API instead of get_device_resource() and set_device_resource() functions
- #965 Handle exceptions from pool_threads
- #963 Set log_level when using the LOGGING_LEVEL param (see the sketch after this list)
- #973 Fix how we check the existence of the JAVA_HOME environment variable
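
A hedged sketch of how the logging parameters from #941 and #963 are supplied, assuming config_options can be passed when creating the context (the option value shown is illustrative):

```python
from blazingsql import BlazingContext

# LOGGING_LEVEL sets the engine's log level (#963); log flushing
# behaviour is configurable as well (#941).
bc = BlazingContext(config_options={'LOGGING_LEVEL': 'debug'})
```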

# BlazingSQL 0.14.0 (June 9, 2020)

- #391 Added the ability to run count distinct queries in a distributed fashion
- #392 Remove the unnecessary messages on distributed mode
- #560 Fixed bug where parsing errors would lead to crash
- #565 Made csv reading behaviour match cudf's
- #612 Print product version: print(blazingsql.__version__) # shows the git hash
- #638 Refactors and fixes SortAndSample kernels
- #631 Implemented ability to send config_options to bc.sql function
- #621 Clean dead code
- #602 Implements cache flow control feature
- #625 Implement CAST to TINYINT and SMALLINT
- #632 Implement CHAR_LENGTH function
- #635 Handle behavior when the optimized plan contains a LogicalValues
- #653 Handle exceptions on python side
- #661 added hive support to parse_batch
- #662 updated from_cudf code and fixed other issue due to new cudf::list_view
- #674 Allow to define and use a specific AWS S3 region
- #677 added guava to pom.xml
- #679 Support modern compilers (>= g++-7.x)
- #649 Adding event logging
- #660 Changed how we handle the partitions of a dask.cudf.DataFrame
- #697 Update expression parser
- #659 Improve reading for: SELECT * FROM table LIMIT N
- #700 Support null column in projection
- #711 Migrate end to end tests into blazingsql repo
- #718 Changed all condition variable waits to wait_for
- #712 Fixed how we handle empty tables when estimating for small table joins
- #724 Removed unused BlazingThread creations
- #725 Added nullptr check to num_rows()
- #729 Fixed issue with num_rows() and wait_for
- #728 Add replace_calcite_regex function to the join condition
- #721 Handling multi-partition output
- #750 Each table scan now has its own data loader
- #740 Normalizing types for UNION ALL
- #744 Fix unit tests
- #743 Workaround for interops 64 index plan limitation
- #763 Implemented ability to set the folder for all log files
- #757 Ensure GPU portability (so we can run on any cloud instance with GPU)
- #753 Fix for host memory threshold parameter with Dask envs
- #801 Fix build with new cudf 0.15 and arrow 0.17.1
- #809 Fix conda build issues
- #828 Fix gpuci issues and improve tooling to debug gpuci related issues
- #867 Fix boost dependency issues
- #785 Add script for Manual Testing Artifacts.
- #931 Add script for error messages validation.
- #932 Import pydrill and pyspark only when in generator or full mode.

88 changes: 64 additions & 24 deletions README.md
@@ -70,29 +70,42 @@ bc.sql('SELECT passenger_count, trip_distance FROM taxi LIMIT 2')
## Documentation
You can find our full documentation at [docs.blazingdb.com](https://docs.blazingdb.com/docs).

# Prerequisites
* [Anaconda or Miniconda](https://docs.conda.io/projects/conda/en/latest/user-guide/install/linux.html) installed
* OS Support
* Ubuntu 16.04/18.04 LTS
* CentOS 7
* GPU Support
* Pascal or Better
* Compute Capability >= 6.0
* CUDA Support
* 10.1.2
* 10.2
* Python Support
* 3.7
* 3.8

# Install Using Conda
BlazingSQL can be installed with conda ([miniconda](https://conda.io/miniconda.html), or the full [Anaconda distribution](https://www.anaconda.com/download)) from the [blazingsql](https://anaconda.org/blazingsql/) channel:

Note: BlazingSQL is supported only on Linux, with Python versions 3.7 and 3.8.

## Stable Version
```bash
conda install -c blazingsql/label/cuda$CUDA_VERSION -c blazingsql -c rapidsai -c nvidia -c conda-forge -c defaults blazingsql python=$PYTHON_VERSION
conda install -c blazingsql/label/cuda$CUDA_VERSION -c rapidsai -c nvidia -c conda-forge -c defaults blazingsql python=$PYTHON_VERSION
```
Where $CUDA_VERSION is 10.0, 10.1 or 10.2 and $PYTHON_VERSION is 3.6 or 3.7
*For example for CUDA 10.0 and Python 3.7:*
Where $CUDA_VERSION is 10.1, 10.2 or 11.0 and $PYTHON_VERSION is 3.7 or 3.8
*For example for CUDA 10.1 and Python 3.7:*
```bash
conda install -c blazingsql/label/cuda10.0 -c blazingsql -c rapidsai -c nvidia -c conda-forge -c defaults blazingsql python=3.7
conda install -c blazingsql/label/cuda10.1 -c rapidsai -c nvidia -c conda-forge -c defaults blazingsql python=3.7
```
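
To confirm the install, print the package version; per changelog entry #612, `blazingsql.__version__` includes the git hash:

```python
import blazingsql

print(blazingsql.__version__)  # shows the git hash, per #612
```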

## Nightly Version
```bash
conda install -c blazingsql-nightly/label/cuda$CUDA_VERSION -c blazingsql-nightly -c rapidsai-nightly -c nvidia -c conda-forge -c defaults blazingsql python=$PYTHON_VERSION
conda install -c blazingsql-nightly/label/cuda$CUDA_VERSION -c rapidsai-nightly -c nvidia -c conda-forge -c defaults blazingsql python=$PYTHON_VERSION
```
Where $CUDA_VERSION is 10.0, 10.1 or 10.2 and $PYTHON_VERSION is 3.6 or 3.7
*For example for CUDA 10.0 and Python 3.7:*
Where $CUDA_VERSION is 10.1, 10.2 or 11.0 and $PYTHON_VERSION is 3.7 or 3.8
*For example for CUDA 10.1 and Python 3.7:*
```bash
conda install -c blazingsql-nightly/label/cuda10.0 -c blazingsql-nightly -c rapidsai-nightly -c nvidia -c conda-forge -c defaults blazingsql python=3.7
conda install -c blazingsql-nightly/label/cuda10.1 -c rapidsai-nightly -c nvidia -c conda-forge -c defaults blazingsql python=3.7
```

# Build/Install from Source (Conda Environment)
@@ -105,17 +118,15 @@ This is the recommended way of building all of the BlazingSQL components and dependencies
conda create -n bsql python=$PYTHON_VERSION
conda activate bsql
conda install --yes -c conda-forge openjdk=8.0 maven cmake gtest gmock rapidjson cppzmq cython=0.29 jpype1 netifaces pyhive
conda install --yes -c conda-forge -c blazingsql bsql-toolchain
conda install --yes -c rapidsai -c nvidia -c conda-forge -c defaults cudf=0.14 dask-cudf=0.14 dask-cuda=0.14 cudatoolkit=$CUDA_VERSION
conda install --yes -c rapidsai -c nvidia -c conda-forge -c defaults cudf=0.15 dask-cudf=0.15 dask-cuda=0.15 cudatoolkit=$CUDA_VERSION
```
Where $CUDA_VERSION is 10.0, 10.1 or 10.2 and $PYTHON_VERSION is 3.6 or 3.7
*For example for CUDA 10.0 and Python 3.7:*
Where $CUDA_VERSION is 10.1, 10.2 or 11.0 and $PYTHON_VERSION is 3.7 or 3.8
*For example for CUDA 10.1 and Python 3.7:*
```bash
conda create -n bsql python=3.7
conda activate bsql
conda install --yes -c conda-forge openjdk=8.0 maven cmake gtest gmock rapidjson cppzmq cython=0.29 jpype1 netifaces pyhive
conda install --yes -c conda-forge -c blazingsql bsql-toolchain
conda install --yes -c rapidsai -c nvidia -c conda-forge -c defaults cudf=0.14 dask-cudf=0.14 dask-cuda=0.14 cudatoolkit=10.0
conda install --yes -c rapidsai -c nvidia -c conda-forge -c defaults cudf=0.15 dask-cudf=0.15 dask-cuda=0.15 cudatoolkit=10.1
```

### Build
@@ -139,18 +150,19 @@ $CONDA_PREFIX now has a folder for the blazingsql repository.
```bash
conda create -n bsql python=$PYTHON_VERSION
conda activate bsql
conda install --yes -c conda-forge openjdk=8.0 maven cmake gtest gmock rapidjson cppzmq cython=0.29 jpype1 netifaces pyhive
conda install --yes -c conda-forge -c blazingsql-nightly bsql-toolchain=0.15
conda install --yes -c rapidsai-nightly -c nvidia -c conda-forge -c defaults libcudf=0.15 cudf=0.15 dask-cudf=0.15 dask-cuda=0.15 cudatoolkit=$CUDA_VERSION

conda install --yes -c conda-forge google-cloud-cpp ninja
conda install --yes -c rapidsai-nightly -c nvidia -c conda-forge -c defaults dask-cuda=0.16 dask-cudf=0.16 cudf=0.16 python=3.7 cudatoolkit=$CUDA_VERSION
conda install --yes -c conda-forge cmake gtest gmock cppzmq cython=0.29 openjdk=8.0 maven thrift=0.13.0 jpype1 netifaces pyhive
```
Where $CUDA_VERSION is 10.0, 10.1 or 10.2 and $PYTHON_VERSION is 3.6 or 3.7
*For example for CUDA 10.0 and Python 3.7:*
Where $CUDA_VERSION is 10.1, 10.2 or 11.0 and $PYTHON_VERSION is 3.7 or 3.8
*For example for CUDA 10.1 and Python 3.7:*
```bash
conda create -n bsql python=3.7
conda activate bsql
conda install --yes -c conda-forge openjdk=8.0 maven cmake gtest gmock rapidjson cppzmq cython=0.29 jpype1 netifaces pyhive
conda install --yes -c conda-forge -c blazingsql-nightly bsql-toolchain=0.15
conda install --yes -c rapidsai-nightly -c nvidia -c conda-forge -c defaults libcudf=0.15 cudf=0.15 dask-cudf=0.15 dask-cuda=0.15 cudatoolkit=10.0
conda install --yes -c conda-forge google-cloud-cpp ninja
conda install --yes -c rapidsai-nightly -c nvidia -c conda-forge -c defaults dask-cuda=0.16 dask-cudf=0.16 cudf=0.16 python=3.7 cudatoolkit=10.1
conda install --yes -c conda-forge cmake gtest gmock cppzmq cython=0.29 openjdk=8.0 maven thrift=0.13.0 jpype1 netifaces pyhive
```

### Build
@@ -167,6 +179,34 @@ NOTE: You can do `./build.sh -h` to see more build options.

$CONDA_PREFIX now has a folder for the blazingsql repository.

#### Storage plugins
To build without the storage plugins (AWS S3, Google Cloud Storage) use the following arguments:
```bash
# Disable all storage plugins
./build.sh disable-aws-s3 disable-google-gs

# Disable AWS S3 storage plugin
./build.sh disable-aws-s3

# Disable Google Cloud Storage plugin
./build.sh disable-google-gs
```
NOTE: If you disable the storage plugins, you do not need to install the AWS SDK for C++ or the Google Cloud Storage client beforehand (nor any of their dependencies).
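
When the plugins are built, they back BlazingContext's filesystem registration API. A minimal S3 sketch; the registration name, bucket, and credential parameter names are illustrative assumptions, not a definitive signature:

```python
from blazingsql import BlazingContext

bc = BlazingContext()
# Register an S3 bucket under a name, then create tables from s3:// paths
bc.s3('my_data', bucket_name='my-bucket',
      access_key_id='<ACCESS_KEY>', secret_key='<SECRET_KEY>')
bc.create_table('orders', 's3://my_data/orders/*.parquet')
```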

# Documentation
User guides and public API documentation can be found [here](https://docs.blazingdb.com/docs).

Documentation for our internal code architecture can be built using Sphinx.
```bash
pip install recommonmark exhale
conda install -c conda-forge doxygen
cd $CONDA_PREFIX
cd blazingsql/docs
make html
```
The generated documentation can be viewed in a browser at `blazingsql/docs/_build/html/index.html`


# Community
## Contributing
Have questions or feedback? Post a [new github issue](https://github.com/blazingdb/blazingsql/issues/new/choose).
7 changes: 7 additions & 0 deletions algebra/blazingdb-calcite-application/pom.xml
@@ -71,6 +71,13 @@
<artifactId>commons-cli</artifactId>
<version>1.4</version>
</dependency>

<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-dbcp2</artifactId>
<version>2.6.0</version>
</dependency>

</dependencies>

<build>
RelationalAlgebraGenerator.java
Expand Up @@ -28,6 +28,8 @@
import org.apache.calcite.rel.rules.ProjectMergeRule;
import org.apache.calcite.rel.rules.ProjectRemoveRule;
import org.apache.calcite.rel.rules.AggregateReduceFunctionsRule;
import org.apache.calcite.rel.rules.ReduceExpressionsRule;
import org.apache.calcite.rex.RexExecutorImpl;
import org.apache.calcite.rel.type.RelDataTypeSystem;
import org.apache.calcite.schema.SchemaPlus;
import org.apache.calcite.sql.SqlNode;
@@ -182,11 +184,16 @@ public RelationalAlgebraGenerator(FrameworkConfig frameworkConfig, HepProgram he
.addRuleInstance(FilterAggregateTransposeRule.INSTANCE)
.addRuleInstance(FilterJoinRule.JoinConditionPushRule.FILTER_ON_JOIN)
.addRuleInstance(FilterJoinRule.JoinConditionPushRule.JOIN)
.addRuleInstance(ProjectMergeRule.INSTANCE)
.addRuleInstance(FilterMergeRule.INSTANCE)
.addRuleInstance(ProjectJoinTransposeRule.INSTANCE)
.addRuleInstance(ProjectFilterTransposeRule.INSTANCE)
.addRuleInstance(ProjectMergeRule.INSTANCE)
.addRuleInstance(ProjectRemoveRule.INSTANCE)

//The following rules evaluate constant expressions in Projects and Filters,
//e.g. folding an expression like +(1, 2) into the literal 3
.addRuleInstance(ReduceExpressionsRule.PROJECT_INSTANCE)
.addRuleInstance(ReduceExpressionsRule.FILTER_INSTANCE)

.addRuleInstance(ProjectTableScanRule.INSTANCE)
.addRuleInstance(FilterTableScanRule.INSTANCE)
.addRuleInstance(FilterRemoveIsNotDistinctFromRule.INSTANCE)
Expand All @@ -201,6 +208,7 @@ public RelationalAlgebraGenerator(FrameworkConfig frameworkConfig, HepProgram he
}

final HepPlanner hepPlanner = new HepPlanner(program, config.getContext());
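// Register an executor so the ReduceExpressionsRule instances added above
// can actually evaluate constant expressions during planning.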
nonOptimizedPlan.getCluster().getPlanner().setExecutor(new RexExecutorImpl(null));
hepPlanner.setRoot(nonOptimizedPlan);

planner.close();
@@ -240,11 +248,13 @@ public RelationalAlgebraGenerator(FrameworkConfig frameworkConfig, HepProgram he
}catch(SqlValidationException ex){
//System.out.println(ex.getMessage());
//System.out.println("Found validation err!");
return "fail: \n " + ex.getMessage();
throw ex;
//return "fail: \n " + ex.getMessage();
}catch(SqlSyntaxException ex){
//System.out.println(ex.getMessage());
//System.out.println("Found syntax err!");
return "fail: \n " + ex.getMessage();
throw ex;
//return "fail: \n " + ex.getMessage();
} catch(Exception ex) {
//System.out.println(ex.toString());
//System.out.println(ex.getMessage());
SqlValidationException.java
@@ -17,8 +17,7 @@ public SqlValidationException(final String queryString, final ValidationException

private static String
description(final String queryString, final String message) {

return message;

// Strip Calcite's position prefix (e.g. "From line 1, column 8 to line 1,
// column 19: ...") by splitting on the trailing "<number>: " and keeping
// only the human-readable part of the message.
String[] a = message.split("(\\d+): ");
return a[a.length - 1]; //TODO William, Felipe, Rommel, Percy use a better regular expression / approach
}
}
