diff --git a/.clang-format b/.clang-format
new file mode 100644
index 00000000..8920ed8b
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,5 @@
+---
+BasedOnStyle: Google
+---
+Language: Cpp
+ColumnLimit: 80
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 00000000..76af67b1
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+test_case/
diff --git a/3rdparty/cub-1.8.0/.cproject b/3rdparty/cub-1.8.0/.cproject
new file mode 100644
index 00000000..e76d1da6
--- /dev/null
+++ b/3rdparty/cub-1.8.0/.cproject
@@ -0,0 +1,1223 @@
+[1,223 lines of Eclipse CDT .cproject XML omitted: the markup was stripped during extraction and cannot be reconstructed.]
diff --git a/3rdparty/cub-1.8.0/.project b/3rdparty/cub-1.8.0/.project
new file mode 100644
index 00000000..7aca9e04
--- /dev/null
+++ b/3rdparty/cub-1.8.0/.project
@@ -0,0 +1,27 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<projectDescription>
+	<name>GIT_CUB</name>
+	<comment></comment>
+	<projects>
+	</projects>
+	<buildSpec>
+		<buildCommand>
+			<name>org.eclipse.cdt.managedbuilder.core.genmakebuilder</name>
+			<triggers>clean,full,incremental,</triggers>
+			<arguments>
+			</arguments>
+		</buildCommand>
+		<buildCommand>
+			<name>org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder</name>
+			<triggers>full,incremental,</triggers>
+			<arguments>
+			</arguments>
+		</buildCommand>
+	</buildSpec>
+	<natures>
+		<nature>org.eclipse.cdt.core.cnature</nature>
+		<nature>org.eclipse.cdt.managedbuilder.core.managedBuildNature</nature>
+		<nature>org.eclipse.cdt.managedbuilder.core.ScannerConfigNature</nature>
+		<nature>org.eclipse.cdt.core.ccnature</nature>
+	</natures>
+</projectDescription>
diff --git a/3rdparty/cub-1.8.0/.settings/.gitignore b/3rdparty/cub-1.8.0/.settings/.gitignore
new file mode 100644
index 00000000..d81d4c41
--- /dev/null
+++ b/3rdparty/cub-1.8.0/.settings/.gitignore
@@ -0,0 +1 @@
+/language.settings.xml
diff --git a/3rdparty/cub-1.8.0/.settings/org.eclipse.cdt.codan.core.prefs b/3rdparty/cub-1.8.0/.settings/org.eclipse.cdt.codan.core.prefs
new file mode 100644
index 00000000..64da7771
--- /dev/null
+++ b/3rdparty/cub-1.8.0/.settings/org.eclipse.cdt.codan.core.prefs
@@ -0,0 +1,72 @@
+eclipse.preferences.version=1
+org.eclipse.cdt.codan.checkers.errnoreturn=Warning
+org.eclipse.cdt.codan.checkers.errnoreturn.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true},implicit\=>false}
+org.eclipse.cdt.codan.checkers.errreturnvalue=Error
+org.eclipse.cdt.codan.checkers.errreturnvalue.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+org.eclipse.cdt.codan.checkers.nocommentinside=-Error
+org.eclipse.cdt.codan.checkers.nocommentinside.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+org.eclipse.cdt.codan.checkers.nolinecomment=-Error
+org.eclipse.cdt.codan.checkers.nolinecomment.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+org.eclipse.cdt.codan.checkers.noreturn=Error
+org.eclipse.cdt.codan.checkers.noreturn.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true},implicit\=>false}
+org.eclipse.cdt.codan.internal.checkers.AbstractClassCreation=Error
+org.eclipse.cdt.codan.internal.checkers.AbstractClassCreation.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+org.eclipse.cdt.codan.internal.checkers.AmbiguousProblem=Error
+org.eclipse.cdt.codan.internal.checkers.AmbiguousProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+org.eclipse.cdt.codan.internal.checkers.AssignmentInConditionProblem=Warning
+org.eclipse.cdt.codan.internal.checkers.AssignmentInConditionProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+org.eclipse.cdt.codan.internal.checkers.AssignmentToItselfProblem=Error
+org.eclipse.cdt.codan.internal.checkers.AssignmentToItselfProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+org.eclipse.cdt.codan.internal.checkers.CaseBreakProblem=Warning
+org.eclipse.cdt.codan.internal.checkers.CaseBreakProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true},no_break_comment\=>"no break",last_case_param\=>true,empty_case_param\=>false}
+org.eclipse.cdt.codan.internal.checkers.CatchByReference=Warning
+org.eclipse.cdt.codan.internal.checkers.CatchByReference.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true},unknown\=>false,exceptions\=>()}
+org.eclipse.cdt.codan.internal.checkers.CircularReferenceProblem=Error
+org.eclipse.cdt.codan.internal.checkers.CircularReferenceProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+org.eclipse.cdt.codan.internal.checkers.ClassMembersInitialization=Warning
+org.eclipse.cdt.codan.internal.checkers.ClassMembersInitialization.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true},skip\=>true}
+org.eclipse.cdt.codan.internal.checkers.FieldResolutionProblem=Error
+org.eclipse.cdt.codan.internal.checkers.FieldResolutionProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+org.eclipse.cdt.codan.internal.checkers.FunctionResolutionProblem=Error
+org.eclipse.cdt.codan.internal.checkers.FunctionResolutionProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+org.eclipse.cdt.codan.internal.checkers.InvalidArguments=Error
+org.eclipse.cdt.codan.internal.checkers.InvalidArguments.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+org.eclipse.cdt.codan.internal.checkers.InvalidTemplateArgumentsProblem=Error
+org.eclipse.cdt.codan.internal.checkers.InvalidTemplateArgumentsProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+org.eclipse.cdt.codan.internal.checkers.LabelStatementNotFoundProblem=Error
+org.eclipse.cdt.codan.internal.checkers.LabelStatementNotFoundProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+org.eclipse.cdt.codan.internal.checkers.MemberDeclarationNotFoundProblem=Error
+org.eclipse.cdt.codan.internal.checkers.MemberDeclarationNotFoundProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+org.eclipse.cdt.codan.internal.checkers.MethodResolutionProblem=Error
+org.eclipse.cdt.codan.internal.checkers.MethodResolutionProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+org.eclipse.cdt.codan.internal.checkers.NamingConventionFunctionChecker=-Info
+org.eclipse.cdt.codan.internal.checkers.NamingConventionFunctionChecker.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true},pattern\=>"^[a-z]",macro\=>true,exceptions\=>()}
+org.eclipse.cdt.codan.internal.checkers.NonVirtualDestructorProblem=Warning
+org.eclipse.cdt.codan.internal.checkers.NonVirtualDestructorProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+org.eclipse.cdt.codan.internal.checkers.OverloadProblem=Error
+org.eclipse.cdt.codan.internal.checkers.OverloadProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+org.eclipse.cdt.codan.internal.checkers.RedeclarationProblem=Error
+org.eclipse.cdt.codan.internal.checkers.RedeclarationProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+org.eclipse.cdt.codan.internal.checkers.RedefinitionProblem=Error
+org.eclipse.cdt.codan.internal.checkers.RedefinitionProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+org.eclipse.cdt.codan.internal.checkers.ReturnStyleProblem=-Warning
+org.eclipse.cdt.codan.internal.checkers.ReturnStyleProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+org.eclipse.cdt.codan.internal.checkers.ScanfFormatStringSecurityProblem=-Warning
+org.eclipse.cdt.codan.internal.checkers.ScanfFormatStringSecurityProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+org.eclipse.cdt.codan.internal.checkers.StatementHasNoEffectProblem=Warning
+org.eclipse.cdt.codan.internal.checkers.StatementHasNoEffectProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true},macro\=>true,exceptions\=>()}
+org.eclipse.cdt.codan.internal.checkers.SuggestedParenthesisProblem=Warning
+org.eclipse.cdt.codan.internal.checkers.SuggestedParenthesisProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true},paramNot\=>false}
+org.eclipse.cdt.codan.internal.checkers.SuspiciousSemicolonProblem=Warning
+org.eclipse.cdt.codan.internal.checkers.SuspiciousSemicolonProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true},else\=>false,afterelse\=>false}
+org.eclipse.cdt.codan.internal.checkers.TypeResolutionProblem=Error
+org.eclipse.cdt.codan.internal.checkers.TypeResolutionProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+org.eclipse.cdt.codan.internal.checkers.UnusedFunctionDeclarationProblem=Warning
+org.eclipse.cdt.codan.internal.checkers.UnusedFunctionDeclarationProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true},macro\=>true}
+org.eclipse.cdt.codan.internal.checkers.UnusedStaticFunctionProblem=Warning
+org.eclipse.cdt.codan.internal.checkers.UnusedStaticFunctionProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true},macro\=>true}
+org.eclipse.cdt.codan.internal.checkers.UnusedVariableDeclarationProblem=Warning
+org.eclipse.cdt.codan.internal.checkers.UnusedVariableDeclarationProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true},macro\=>true,exceptions\=>("@(\#)","$Id")}
+org.eclipse.cdt.codan.internal.checkers.VariableResolutionProblem=Error
+org.eclipse.cdt.codan.internal.checkers.VariableResolutionProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+useParentScope=false
diff --git a/3rdparty/cub-1.8.0/.settings/org.eclipse.cdt.core.prefs b/3rdparty/cub-1.8.0/.settings/org.eclipse.cdt.core.prefs
new file mode 100644
index 00000000..80b8e65c
--- /dev/null
+++ b/3rdparty/cub-1.8.0/.settings/org.eclipse.cdt.core.prefs
@@ -0,0 +1,177 @@
+eclipse.preferences.version=1
+indexer/indexAllFiles=true
+indexer/indexAllHeaderVersions=false
+indexer/indexAllVersionsSpecificHeaders=
+indexer/indexOnOpen=false
+indexer/indexUnusedHeadersWithAlternateLang=false
+indexer/indexUnusedHeadersWithDefaultLang=true
+indexer/indexerId=org.eclipse.cdt.core.fastIndexer
+indexer/skipFilesLargerThanMB=8
+indexer/skipImplicitReferences=false
+indexer/skipIncludedFilesLargerThanMB=16
+indexer/skipMacroReferences=false
+indexer/skipReferences=false
+indexer/skipTypeReferences=false
+indexer/useHeuristicIncludeResolution=true
+org.eclipse.cdt.core.formatter.alignment_for_arguments_in_method_invocation=16
+org.eclipse.cdt.core.formatter.alignment_for_assignment=16
+org.eclipse.cdt.core.formatter.alignment_for_base_clause_in_type_declaration=48
+org.eclipse.cdt.core.formatter.alignment_for_binary_expression=16
+org.eclipse.cdt.core.formatter.alignment_for_compact_if=0
+org.eclipse.cdt.core.formatter.alignment_for_conditional_expression=48
+org.eclipse.cdt.core.formatter.alignment_for_conditional_expression_chain=18
+org.eclipse.cdt.core.formatter.alignment_for_constructor_initializer_list=0
+org.eclipse.cdt.core.formatter.alignment_for_declarator_list=16
+org.eclipse.cdt.core.formatter.alignment_for_enumerator_list=48
+org.eclipse.cdt.core.formatter.alignment_for_expression_list=0
+org.eclipse.cdt.core.formatter.alignment_for_expressions_in_array_initializer=16
+org.eclipse.cdt.core.formatter.alignment_for_member_access=0
+org.eclipse.cdt.core.formatter.alignment_for_overloaded_left_shift_chain=16
+org.eclipse.cdt.core.formatter.alignment_for_parameters_in_method_declaration=48
+org.eclipse.cdt.core.formatter.alignment_for_throws_clause_in_method_declaration=48
+org.eclipse.cdt.core.formatter.brace_position_for_array_initializer=next_line
+org.eclipse.cdt.core.formatter.brace_position_for_block=next_line
+org.eclipse.cdt.core.formatter.brace_position_for_block_in_case=end_of_line
+org.eclipse.cdt.core.formatter.brace_position_for_method_declaration=next_line
+org.eclipse.cdt.core.formatter.brace_position_for_namespace_declaration=end_of_line
+org.eclipse.cdt.core.formatter.brace_position_for_switch=end_of_line
+org.eclipse.cdt.core.formatter.brace_position_for_type_declaration=next_line
+org.eclipse.cdt.core.formatter.comment.min_distance_between_code_and_line_comment=1
+org.eclipse.cdt.core.formatter.comment.never_indent_line_comments_on_first_column=true
+org.eclipse.cdt.core.formatter.comment.preserve_white_space_between_code_and_line_comments=true
+org.eclipse.cdt.core.formatter.compact_else_if=true
+org.eclipse.cdt.core.formatter.continuation_indentation=1
+org.eclipse.cdt.core.formatter.continuation_indentation_for_array_initializer=1
+org.eclipse.cdt.core.formatter.format_guardian_clause_on_one_line=false
+org.eclipse.cdt.core.formatter.indent_access_specifier_compare_to_type_header=false
+org.eclipse.cdt.core.formatter.indent_access_specifier_extra_spaces=0
+org.eclipse.cdt.core.formatter.indent_body_declarations_compare_to_access_specifier=true
+org.eclipse.cdt.core.formatter.indent_body_declarations_compare_to_namespace_header=false
+org.eclipse.cdt.core.formatter.indent_breaks_compare_to_cases=true
+org.eclipse.cdt.core.formatter.indent_declaration_compare_to_template_header=false
+org.eclipse.cdt.core.formatter.indent_empty_lines=false
+org.eclipse.cdt.core.formatter.indent_statements_compare_to_block=true
+org.eclipse.cdt.core.formatter.indent_statements_compare_to_body=true
+org.eclipse.cdt.core.formatter.indent_switchstatements_compare_to_cases=true
+org.eclipse.cdt.core.formatter.indent_switchstatements_compare_to_switch=false
+org.eclipse.cdt.core.formatter.indentation.size=4
+org.eclipse.cdt.core.formatter.insert_new_line_after_opening_brace_in_array_initializer=do not insert
+org.eclipse.cdt.core.formatter.insert_new_line_after_template_declaration=do not insert
+org.eclipse.cdt.core.formatter.insert_new_line_at_end_of_file_if_missing=do not insert
+org.eclipse.cdt.core.formatter.insert_new_line_before_catch_in_try_statement=insert
+org.eclipse.cdt.core.formatter.insert_new_line_before_closing_brace_in_array_initializer=do not insert
+org.eclipse.cdt.core.formatter.insert_new_line_before_colon_in_constructor_initializer_list=do not insert
+org.eclipse.cdt.core.formatter.insert_new_line_before_else_in_if_statement=insert
+org.eclipse.cdt.core.formatter.insert_new_line_before_identifier_in_function_declaration=do not insert
+org.eclipse.cdt.core.formatter.insert_new_line_before_while_in_do_statement=do not insert
+org.eclipse.cdt.core.formatter.insert_new_line_in_empty_block=insert
+org.eclipse.cdt.core.formatter.insert_space_after_assignment_operator=insert
+org.eclipse.cdt.core.formatter.insert_space_after_binary_operator=insert
+org.eclipse.cdt.core.formatter.insert_space_after_closing_angle_bracket_in_template_arguments=insert
+org.eclipse.cdt.core.formatter.insert_space_after_closing_angle_bracket_in_template_parameters=insert
+org.eclipse.cdt.core.formatter.insert_space_after_closing_brace_in_block=insert
+org.eclipse.cdt.core.formatter.insert_space_after_closing_paren_in_cast=insert
+org.eclipse.cdt.core.formatter.insert_space_after_colon_in_base_clause=insert
+org.eclipse.cdt.core.formatter.insert_space_after_colon_in_case=insert
+org.eclipse.cdt.core.formatter.insert_space_after_colon_in_conditional=insert
+org.eclipse.cdt.core.formatter.insert_space_after_colon_in_labeled_statement=insert
+org.eclipse.cdt.core.formatter.insert_space_after_comma_in_array_initializer=insert
+org.eclipse.cdt.core.formatter.insert_space_after_comma_in_base_types=insert
+org.eclipse.cdt.core.formatter.insert_space_after_comma_in_declarator_list=insert
+org.eclipse.cdt.core.formatter.insert_space_after_comma_in_enum_declarations=insert
+org.eclipse.cdt.core.formatter.insert_space_after_comma_in_expression_list=insert
+org.eclipse.cdt.core.formatter.insert_space_after_comma_in_method_declaration_parameters=insert
+org.eclipse.cdt.core.formatter.insert_space_after_comma_in_method_declaration_throws=insert
+org.eclipse.cdt.core.formatter.insert_space_after_comma_in_method_invocation_arguments=insert
+org.eclipse.cdt.core.formatter.insert_space_after_comma_in_template_arguments=insert
+org.eclipse.cdt.core.formatter.insert_space_after_comma_in_template_parameters=insert
+org.eclipse.cdt.core.formatter.insert_space_after_opening_angle_bracket_in_template_arguments=do not insert
+org.eclipse.cdt.core.formatter.insert_space_after_opening_angle_bracket_in_template_parameters=do not insert
+org.eclipse.cdt.core.formatter.insert_space_after_opening_brace_in_array_initializer=insert
+org.eclipse.cdt.core.formatter.insert_space_after_opening_bracket=do not insert
+org.eclipse.cdt.core.formatter.insert_space_after_opening_paren_in_cast=do not insert
+org.eclipse.cdt.core.formatter.insert_space_after_opening_paren_in_catch=do not insert
+org.eclipse.cdt.core.formatter.insert_space_after_opening_paren_in_exception_specification=do not insert
+org.eclipse.cdt.core.formatter.insert_space_after_opening_paren_in_for=do not insert
+org.eclipse.cdt.core.formatter.insert_space_after_opening_paren_in_if=do not insert
+org.eclipse.cdt.core.formatter.insert_space_after_opening_paren_in_method_declaration=do not insert
+org.eclipse.cdt.core.formatter.insert_space_after_opening_paren_in_method_invocation=do not insert
+org.eclipse.cdt.core.formatter.insert_space_after_opening_paren_in_parenthesized_expression=do not insert
+org.eclipse.cdt.core.formatter.insert_space_after_opening_paren_in_switch=do not insert
+org.eclipse.cdt.core.formatter.insert_space_after_opening_paren_in_while=do not insert
+org.eclipse.cdt.core.formatter.insert_space_after_postfix_operator=do not insert
+org.eclipse.cdt.core.formatter.insert_space_after_prefix_operator=do not insert
+org.eclipse.cdt.core.formatter.insert_space_after_question_in_conditional=insert
+org.eclipse.cdt.core.formatter.insert_space_after_semicolon_in_for=insert
+org.eclipse.cdt.core.formatter.insert_space_after_unary_operator=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_assignment_operator=insert
+org.eclipse.cdt.core.formatter.insert_space_before_binary_operator=insert
+org.eclipse.cdt.core.formatter.insert_space_before_closing_angle_bracket_in_template_arguments=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_closing_angle_bracket_in_template_parameters=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_closing_brace_in_array_initializer=insert
+org.eclipse.cdt.core.formatter.insert_space_before_closing_bracket=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_closing_paren_in_cast=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_closing_paren_in_catch=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_closing_paren_in_exception_specification=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_closing_paren_in_for=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_closing_paren_in_if=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_closing_paren_in_method_declaration=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_closing_paren_in_method_invocation=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_closing_paren_in_parenthesized_expression=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_closing_paren_in_switch=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_closing_paren_in_while=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_colon_in_base_clause=insert
+org.eclipse.cdt.core.formatter.insert_space_before_colon_in_case=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_colon_in_conditional=insert
+org.eclipse.cdt.core.formatter.insert_space_before_colon_in_default=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_colon_in_labeled_statement=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_comma_in_array_initializer=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_comma_in_base_types=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_comma_in_declarator_list=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_comma_in_enum_declarations=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_comma_in_expression_list=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_comma_in_method_declaration_parameters=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_comma_in_method_declaration_throws=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_comma_in_method_invocation_arguments=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_comma_in_template_arguments=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_comma_in_template_parameters=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_opening_angle_bracket_in_template_arguments=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_opening_angle_bracket_in_template_parameters=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_opening_brace_in_array_initializer=insert
+org.eclipse.cdt.core.formatter.insert_space_before_opening_brace_in_block=insert
+org.eclipse.cdt.core.formatter.insert_space_before_opening_brace_in_method_declaration=insert
+org.eclipse.cdt.core.formatter.insert_space_before_opening_brace_in_namespace_declaration=insert
+org.eclipse.cdt.core.formatter.insert_space_before_opening_brace_in_switch=insert
+org.eclipse.cdt.core.formatter.insert_space_before_opening_brace_in_type_declaration=insert
+org.eclipse.cdt.core.formatter.insert_space_before_opening_bracket=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_opening_paren_in_catch=insert
+org.eclipse.cdt.core.formatter.insert_space_before_opening_paren_in_exception_specification=insert
+org.eclipse.cdt.core.formatter.insert_space_before_opening_paren_in_for=insert
+org.eclipse.cdt.core.formatter.insert_space_before_opening_paren_in_if=insert
+org.eclipse.cdt.core.formatter.insert_space_before_opening_paren_in_method_declaration=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_opening_paren_in_method_invocation=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_opening_paren_in_parenthesized_expression=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_opening_paren_in_switch=insert
+org.eclipse.cdt.core.formatter.insert_space_before_opening_paren_in_while=insert
+org.eclipse.cdt.core.formatter.insert_space_before_postfix_operator=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_prefix_operator=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_question_in_conditional=insert
+org.eclipse.cdt.core.formatter.insert_space_before_semicolon=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_semicolon_in_for=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_unary_operator=do not insert
+org.eclipse.cdt.core.formatter.insert_space_between_empty_braces_in_array_initializer=do not insert
+org.eclipse.cdt.core.formatter.insert_space_between_empty_brackets=do not insert
+org.eclipse.cdt.core.formatter.insert_space_between_empty_parens_in_exception_specification=do not insert
+org.eclipse.cdt.core.formatter.insert_space_between_empty_parens_in_method_declaration=do not insert
+org.eclipse.cdt.core.formatter.insert_space_between_empty_parens_in_method_invocation=do not insert
+org.eclipse.cdt.core.formatter.join_wrapped_lines=true
+org.eclipse.cdt.core.formatter.keep_else_statement_on_same_line=false
+org.eclipse.cdt.core.formatter.keep_empty_array_initializer_on_one_line=false
+org.eclipse.cdt.core.formatter.keep_imple_if_on_one_line=true
+org.eclipse.cdt.core.formatter.keep_then_statement_on_same_line=false
+org.eclipse.cdt.core.formatter.lineSplit=80
+org.eclipse.cdt.core.formatter.number_of_empty_lines_to_preserve=1
+org.eclipse.cdt.core.formatter.put_empty_statement_on_new_line=true
+org.eclipse.cdt.core.formatter.tabulation.char=space
+org.eclipse.cdt.core.formatter.tabulation.size=4
+org.eclipse.cdt.core.formatter.use_tabs_only_for_leading_indentations=false
diff --git a/3rdparty/cub-1.8.0/.settings/org.eclipse.cdt.ui.prefs b/3rdparty/cub-1.8.0/.settings/org.eclipse.cdt.ui.prefs
new file mode 100644
index 00000000..ca73f82d
--- /dev/null
+++ b/3rdparty/cub-1.8.0/.settings/org.eclipse.cdt.ui.prefs
@@ -0,0 +1,3 @@
+eclipse.preferences.version=1
+formatter_profile=_B40C
+formatter_settings_version=1
diff --git a/3rdparty/cub-1.8.0/.settings/org.eclipse.core.runtime.prefs b/3rdparty/cub-1.8.0/.settings/org.eclipse.core.runtime.prefs
new file mode 100644
index 00000000..2e6330e7
--- /dev/null
+++ b/3rdparty/cub-1.8.0/.settings/org.eclipse.core.runtime.prefs
@@ -0,0 +1,4 @@
+content-types/enabled=true
+content-types/org.eclipse.cdt.core.cxxHeader/file-extensions=cuh
+content-types/org.eclipse.cdt.core.cxxSource/file-extensions=cu
+eclipse.preferences.version=1
diff --git a/3rdparty/cub-1.8.0/CHANGE_LOG.TXT b/3rdparty/cub-1.8.0/CHANGE_LOG.TXT
new file mode 100644
index 00000000..ed7f3957
--- /dev/null
+++ b/3rdparty/cub-1.8.0/CHANGE_LOG.TXT
@@ -0,0 +1,403 @@
+1.8.0 02/15/2018
+ - API change: change to the interfaces of cub::ShuffleIndex, cub::ShuffleUp, and
+ cub::ShuffleDown to better compute the PTX shfl control constant for
+ logical warps smaller than 32 threads
+ - Bug fixes:
+ - Issue #112: Bug in WarpScan's broadcast of warp-wide aggregate for
+ logical warps < 32 threads
+
+
+//-----------------------------------------------------------------------------
+
+1.7.5 02/08/2018
+ - Added radix-sorting support for __half keys
+ - Updated sorting policies for improved 8b-key performance
+ - Bug fixes:
+ - Syntax tweaks to mollify Clang
+ - Issue #127: DeviceRunLengthEncode::Encode returns wrong results
+ - Issue #128: 7-bit sorting passes fail for sm61 w/ large-values
+
+
+//-----------------------------------------------------------------------------
+
+1.7.4 09/20/2017
+ - Bug fixes:
+ - Issue #114: Can't pair non-trivially-constructible values in radix sort
+ - Issue #115: WarpReduce segmented reduction broken in CUDA 9 for logical warp sizes < 32
+
+//-----------------------------------------------------------------------------
+
+1.7.3 08/28/2017
+ - Bug fixes:
+ - Issue #110: DeviceHistogram null-pointer exception bug for iterator inputs
+
+//-----------------------------------------------------------------------------
+
+1.7.2 08/26/2017
+ - Bug fixes:
+ - Issue #104: Device-wide reduction is now "run-to-run" deterministic for
+ pseudo-associative reduction operators (like floating point addition)
+
+//-----------------------------------------------------------------------------
+
+1.7.1 08/18/2017
+ - Updated Volta radix sorting tuning policies
+ - Bug fixes:
+ - Issue #104 (uint64_t warp-reduce broken for cub 1.7.0 on cuda 8 and older)
+ - Issue #103 (Can't mix Thrust 9.0 and CUB)
+ - Issue #102 (CUB pulls in windows.h which defines min/max macros that conflict with std::min/std::max)
+ - Issue #99 (Radix sorting crashes NVCC on Windows 10 for SM52)
+ - Issue #98 (cuda-memcheck: --tool initcheck failed with lineOfSight)
+ - Issue #94 (Git clone size)
+ - Issue #93 (accept iterators for segment offsets)
+ - Issue #87 (CUB uses anonymous unions which is not valid C++)
+ - Issue #44 (Check for C++ 11 should be changed that Visual Studio 2013 is also recognized as C++ 11 capable)
+
+//-----------------------------------------------------------------------------
+
+1.7.0 06/07/2017
+ - Compatible with CUDA9 and SM7.x (Volta) independent thread scheduling
+ - API change: remove cub::WarpAll() and cub::WarpAny(). These functions served to
+ emulate __all and __any functionality for SM1.x devices, which did not have those
+ operations. However, the SM1.x devices are now deprecated in CUDA, and the
+ interfaces of these two functions are now lacking the lane-mask needed
+ for collectives to run on Volta SMs having independent thread scheduling.
+ - Bug fixes:
+ - Issue #86 Incorrect results with ReduceByKey
+
+//-----------------------------------------------------------------------------
+
+1.6.4 12/06/2016
+ - Updated sm_5x, sm_6x tuning policies for radix sorting (3.5B and 3.4B
+ 32b keys/s on TitanX and GTX 1080, respectively)
+ - Bug fixes:
+ - Restore fence work-around for scan (reduce-by-key, etc.) hangs
+ in CUDA 8.5
+ - Issue 65: DeviceSegmentedRadixSort should allow inputs to have
+ pointer-to-const type
+ - Mollify Clang device-side warnings
+ - Remove out-dated VC project files
+
+//-----------------------------------------------------------------------------
+
+1.6.3 11/20/2016
+ - API change: BlockLoad and BlockStore are now templated by the local
+ data type, instead of the Iterator type. This allows for output iterators
+ having \p void as their \p value_type (e.g., discard iterators).
+ - Updated GP100 tuning policies for radix sorting (6.2B 32b keys/s)
+ - Bug fixes:
+ - Issue #74: WarpReduce executes reduction operator for out-of-bounds items
+ - Issue #72 (cub:InequalityWrapper::operator() should be non-const)
+ - Issue #71 (KeyValuePair won't work if Key has non-trivial ctor)
+ - Issue #70 1.5.3 breaks BlockScan API. Retroactively reversioned
+ from v1.5.3 -> v1.6 to appropriately indicate API change.
+ - Issue #69 cub::BlockStore::Store doesn't compile if OutputIteratorT::value_type != T
+ - Issue #68 (cub::TilePrefixCallbackOp::WarpReduce doesn't permit ptx
+ arch specialization)
+ - Improved support for Win32 platforms (warnings, alignment, etc)
+
+//-----------------------------------------------------------------------------
+
+1.6.2 (was 1.5.5) 10/25/2016
+ - Updated Pascal tuning policies for radix sorting
+ - Bug fixes:
+ - Fix for arm64 compilation of caching allocator
+
+//-----------------------------------------------------------------------------
+
+1.6.1 (was 1.5.4) 10/14/2016
+ - Bug fixes:
+ - Fix for radix sorting bug introduced by scan refactorization
+
+//-----------------------------------------------------------------------------
+
+1.6.0 (was 1.5.3) 10/11/2016
+ - API change: Device/block/warp-wide exclusive scans have been revised to now
+ accept an "initial value" (instead of an "identity value") for seeding the
+ computation with an arbitrary prefix.
+ - API change: Device-wide reductions and scans can now have input sequence types that are
+ different from output sequence types (as long as they are coercible)
+ - Reduce repository size (move doxygen binary to doc repository)
+ - Minor reductions in block-scan instruction count
+ - Bug fixes:
+ - Issue #55: warning in cub/device/dispatch/dispatch_reduce_by_key.cuh
+ - Issue #59: cub::DeviceScan::ExclusiveSum can't prefix sum of float into double
+ - Issue #58: Infinite loop in cub::CachingDeviceAllocator::NearestPowerOf
+ - Issue #47: Caching allocator needs to clean up cuda error upon successful retry
+ - Issue #46: Very high amount of needed memory from the cub::DeviceHistogram::HistogramEven routine
+ - Issue #45: Caching Device Allocator fails with debug output enabled
+ - Fix for generic-type reduce-by-key warpscan (sm3.x and newer)
+
+//-----------------------------------------------------------------------------
+
+1.5.2 03/21/2016
+ - Improved medium-size scan performance for sm5x (Maxwell)
+ - Refactored caching allocator for device memory
+ - Spends less time locked
+ - Failure to allocate a block from the runtime will retry once after
+ freeing cached allocations
+ - Now respects max-bin (issue where blocks in excess of max-bin were
+ still being retained in free cache)
+ - Uses C++11 mutex when available
+ - Bug fixes:
+ - Fix for generic-type reduce-by-key warpscan (sm3.x and newer)
+
+//-----------------------------------------------------------------------------
+
+1.5.1 12/28/2015
+ - Bug fixes:
+ - Fix for incorrect DeviceRadixSort output for some small problems on
+ Maxwell SM52 architectures
+ - Fix for macro redefinition warnings when compiling with Thrust sort
+
+//-----------------------------------------------------------------------------
+
+1.5.0 12/14/2015
+ - New Features:
+ - Added new segmented device-wide operations for device-wide sort and
+ reduction primitives.
+ - Bug fixes:
+ - Fix for Git Issue 36 (Compilation error with GCC 4.8.4 nvcc 7.0.27) and
+ Forums thread (ThreadLoad generates compiler errors when loading from
+ pointer-to-const)
+ - Fix for Git Issue 29 (DeviceRadixSort::SortKeys yields compiler
+ errors)
+ - Fix for Git Issue 26 (CUDA error: misaligned address after
+ cub::DeviceRadixSort::SortKeys())
+ - Fix for incorrect/crash on 0-length problems, e.g., Git Issue 25 (Floating
+ point exception (core dumped) during cub::DeviceRadixSort::SortKeys)
+ - Fix for CUDA 7.5 issues on SM 5.2 with SHFL-based warp-scan and warp-reduction
+ on non-primitive data types (e.g., user-defined structs)
+ - Fix for small radix sorting problems where 0 temporary bytes were
+ required and user code was invoking malloc(0) on some systems where
+ that returns NULL. (The implementation assumed it was being asked for the size
+ again and was not running the sort.)
+
+//-----------------------------------------------------------------------------
+
+1.4.1 04/13/2015
+ - Bug fixes:
+ - Fixes for CUDA 7.0 issues with SHFL-based warp-scan and warp-reduction
+ on non-primitive data types (e.g., user-defined structs)
+ - Fixes for minor CUDA 7.0 performance regressions in cub::DeviceScan,
+ DeviceReduceByKey
+ - Fixes to allow cub::DeviceRadixSort and cub::BlockRadixSort on bool types
+ - Remove requirement for callers to define the CUB_CDP macro
+ when invoking CUB device-wide routines using CUDA dynamic parallelism
+ - Fix for headers not being included in the proper order (or missing includes)
+ for some block-wide functions
+
+//-----------------------------------------------------------------------------
+
+1.4.0 03/18/2015
+ - New Features:
+ - Support and performance tuning for new Maxwell GPU architectures
+ - Updated cub::DeviceHistogram implementation that provides the same
+ "histogram-even" and "histogram-range" functionality as IPP/NPP.
+ Provides extremely fast and, perhaps more importantly, very
+ uniform performance response across diverse real-world datasets,
+ including pathological (homogeneous) sample distributions (resilience)
+ - New cub::DeviceSpmv methods for multiplying sparse matrices by
+ dense vectors, load-balanced using a merge-based parallel decomposition.
+ - New cub::DeviceRadixSort sorting entry-points that always return
+ the sorted output into the specified buffer (as opposed to the
+ cub::DoubleBuffer in which it could end up in either buffer)
+ - New cub::DeviceRunLengthEncode::NonTrivialRuns for finding the starting
+ offsets and lengths of all non-trivial runs (i.e., length > 1) of keys in
+ a given sequence. (Useful for top-down partitioning algorithms like
+ MSD sorting of very-large keys.)
+
+//-----------------------------------------------------------------------------
+
+1.3.2 07/28/2014
+ - Bug fixes:
+ - Fix for cub::DeviceReduce where reductions of small problems
+ (small enough to only dispatch a single thread block) would run in
+ the default stream (stream zero) regardless of whether an alternate
+ stream was specified.
+
+//-----------------------------------------------------------------------------
+
+1.3.1 05/23/2014
+ - Bug fixes:
+ - Workaround for a benign WAW race warning reported by cuda-memcheck
+ in BlockScan specialized for BLOCK_SCAN_WARP_SCANS algorithm.
+ - Fix for bug in DeviceRadixSort where the algorithm may sort more
+ key bits than the caller specified (up to the nearest radix digit).
+ - Fix for ~3% DeviceRadixSort performance regression on Kepler and
+ Fermi that was introduced in v1.3.0.
+
+//-----------------------------------------------------------------------------
+
+1.3.0 05/12/2014
+ - New features:
+ - CUB's collective (block-wide, warp-wide) primitives underwent a minor
+ interface refactoring:
+ - To provide the appropriate support for multidimensional thread blocks,
+ the interfaces for collective classes are now template-parameterized
+ by X, Y, and Z block dimensions (with BLOCK_DIM_Y and BLOCK_DIM_Z being
+ optional, and BLOCK_DIM_X replacing BLOCK_THREADS). Furthermore, the
+ constructors that accept remapped linear thread-identifiers have been
+ removed: all primitives now assume a row-major thread-ranking for
+ multidimensional thread blocks.
+ - To allow the host program (compiled by the host-pass) to
+ accurately determine the device-specific storage requirements for
+ a given collective (compiled for each device-pass), the interfaces
+ for collective classes are now (optionally) template-parameterized
+ by the desired PTX compute capability. This is useful when
+ aliasing collective storage to shared memory that has been
+ allocated dynamically by the host at the kernel call site.
+ - Most CUB programs having typical 1D usage should not require any
+ changes to accommodate these updates.
+ - Added new "combination" WarpScan methods for efficiently computing
+ both inclusive and exclusive prefix scans (and sums).
+ - Bug fixes:
+ - Fixed bug in cub::WarpScan (which affected cub::BlockScan and
+ cub::DeviceScan) where incorrect results (e.g., NAN) would often be
+ returned when parameterized for floating-point types (fp32, fp64).
+ - Workaround-fix for ptxas error when compiling with the -G flag on Linux
+ (for debug instrumentation)
+ - Misc. workaround-fixes for certain scan scenarios (using custom
+ scan operators) where code compiled for SM1x is run on newer
+ GPUs of higher compute-capability: the compiler could not tell
+ which memory space was being used by collective operations and was
+ mistakenly using global ops instead of shared ops.
+
+//-----------------------------------------------------------------------------
+
+1.2.3 04/01/2014
+ - Bug fixes:
+ - Fixed access violation bug in DeviceReduce::ReduceByKey for non-primitive value types
+ - Fixed code-snippet bug in ArgIndexInputIteratorT documentation
+
+//-----------------------------------------------------------------------------
+
+1.2.2 03/03/2014
+ - New features:
+ - Added MS VC++ project solutions for device-wide and block-wide examples
+ - Performance:
+ - Added a third algorithmic variant of cub::BlockReduce for improved performance
+ when using commutative operators (e.g., numeric addition)
+ - Bug fixes:
+ - Fixed bug where inclusion of Thrust headers in a certain order prevented CUB device-wide primitives from working properly
+
+//-----------------------------------------------------------------------------
+
+1.2.0 02/25/2014
+ - New features:
+ - Added device-wide reduce-by-key (DeviceReduce::ReduceByKey, DeviceReduce::RunLengthEncode)
+ - Performance
+ - Improved DeviceScan, DeviceSelect, DevicePartition performance
+ - Documentation and testing:
+ - Compatible with CUDA 6.0
+ - Added performance-portability plots for many device-wide primitives to doc
+ - Update doc and tests to reflect iterator (in)compatibilities with CUDA 5.0 (and older) and Thrust 1.6 (and older).
+ - Bug fixes
+ - Revised the operation of temporary tile status bookkeeping for DeviceScan (and similar) to be safe for current code run on future platforms (now uses proper fences)
+ - Fixed DeviceScan bug where Win32 alignment disagreements between host and device regarding user-defined data types would corrupt tile status
+ - Fixed BlockScan bug where certain exclusive scans on custom data types for the BLOCK_SCAN_WARP_SCANS variant would return incorrect results for the first thread in the block
+ - Added workaround for TexRefInputIteratorT to work with CUDA 6.0
+
+//-----------------------------------------------------------------------------
+
+1.1.1 12/11/2013
+ - New features:
+ - Added TexObjInputIteratorT, TexRefInputIteratorT, CacheModifiedInputIteratorT, and CacheModifiedOutputIterator types for loading & storing arbitrary types through the cache hierarchy. Compatible with Thrust API.
+ - Added descending sorting to DeviceRadixSort and BlockRadixSort
+ - Added min, max, arg-min, and arg-max to DeviceReduce
+ - Added DeviceSelect (select-unique, select-if, and select-flagged)
+ - Added DevicePartition (partition-if, partition-flagged)
+ - Added generic cub::ShuffleUp(), cub::ShuffleDown(), and cub::ShuffleIndex() for warp-wide communication of arbitrary data types (SM3x+)
+ - Added cub::MaxSmOccupancy() for accurately determining SM occupancy for any given kernel function pointer
+ - Performance
+ - Improved DeviceScan and DeviceRadixSort performance for older architectures (SM10-SM30)
+ - Interface changes:
+ - Refactored block-wide I/O (BlockLoad and BlockStore), removing cache-modifiers from their interfaces. The CacheModifiedInputIteratorT and CacheModifiedOutputIterator should now be used with BlockLoad and BlockStore to effect that behavior.
+ - Rename device-wide "stream_synchronous" param to "debug_synchronous" to avoid confusion about usage
+ - Documentation and testing:
+ - Added simple examples of device-wide methods
+ - Improved doxygen documentation and example snippets
+ - Improved test coverage to include up to 21,000 kernel variants and 851,000 unit tests (per architecture, per platform)
+ - Bug fixes
+ - Fixed misc DeviceScan, BlockScan, DeviceReduce, and BlockReduce bugs when operating on non-primitive types for older architectures SM10-SM13
+ - Fixed DeviceScan / WarpReduction bug: SHFL-based segmented reduction producing incorrect results for multi-word types (size > 4B) on Linux
+ - Fixed BlockScan bug: For warpscan-based scans, not all threads in the first warp were entering the prefix callback functor
+ - Fixed DeviceRadixSort bug: race condition with key-value pairs for pre-SM35 architectures
+ - Fixed DeviceRadixSort bug: incorrect bitfield-extract behavior with long keys on 64bit Linux
+ - Fixed BlockDiscontinuity bug: compilation error for types other than int32/uint32
+ - CDP (device-callable) versions of device-wide methods now report the same temporary storage allocation size requirement as their host-callable counterparts
+
+
+//-----------------------------------------------------------------------------
+
+1.0.2 08/23/2013
+ - Corrections to code snippet examples for BlockLoad, BlockStore, and BlockDiscontinuity
+ - Cleaned up unnecessary/missing header includes. You can now safely #include a specific .cuh (instead of cub.cuh)
+ - Bug/compilation fixes for BlockHistogram
+
+//-----------------------------------------------------------------------------
+
+1.0.1 08/08/2013
+ - New collective interface idiom (specialize::construct::invoke).
+ - Added best-in-class DeviceRadixSort. Implements short-circuiting for homogenous digit passes.
+ - Added best-in-class DeviceScan. Implements single-pass "adaptive-lookback" strategy.
+ - Significantly improved documentation (with example code snippets)
+ - More extensive regression test suite for aggressively testing collective variants
+ - Allow non-trivially-constructed types (previously unions had prevented aliasing temporary storage of those types)
+ - Improved support for Kepler SHFL (collective ops now use SHFL for types larger than 32b)
+ - Better code generation for 64-bit addressing within BlockLoad/BlockStore
+ - DeviceHistogram now supports histograms of arbitrary bins
+ - Misc. fixes
+ - Workarounds for SM10 codegen issues in uncommonly-used WarpScan/Reduce specializations
+ - Updates to accommodate CUDA 5.5 dynamic parallelism
+
+
+//-----------------------------------------------------------------------------
+
+0.9.4 05/07/2013
+
+ - Fixed compilation errors for SM10-SM13
+ - Fixed compilation errors for some WarpScan entrypoints on SM30+
+ - Added block-wide histogram (BlockHistogram256)
+ - Added device-wide histogram (DeviceHistogram256)
+ - Added new BlockScan algorithm variant BLOCK_SCAN_RAKING_MEMOIZE, which
+ trades more register consumption for less shared memory I/O
+ - Updates to BlockRadixRank to use BlockScan (which improves performance
+ on Kepler due to SHFL instruction)
+ - Allow types other than C++ primitives to be used in WarpScan::*Sum methods
+ if they only have operator + overloaded. (Previously they were also required
+ to support assignment from int(0).)
+ - Update BlockReduce's BLOCK_REDUCE_WARP_REDUCTIONS algorithm to work even
+ when block size is not an even multiple of warp size
+ - Added work management utility descriptors (GridQueue, GridEvenShare)
+ - Refactoring of DeviceAllocator interface and CachingDeviceAllocator
+ implementation
+ - Misc. documentation updates and corrections.
+
+//-----------------------------------------------------------------------------
+
+0.9.2 04/04/2013
+
+ - Added WarpReduce. WarpReduce uses the SHFL instruction when applicable.
+ BlockReduce now uses this WarpReduce instead of implementing its own.
+ - Misc. fixes for 64-bit Linux compilation warnings and errors.
+ - Misc. documentation updates and corrections.
+
+//-----------------------------------------------------------------------------
+
+0.9.1 03/09/2013
+
+ - Fix for ambiguity in BlockScan::Reduce() between generic reduction and
+ summation. Summation entrypoints are now called ::Sum(), similar to the
+ convention in BlockScan.
+ - Small edits to mainpage documentation and download tracking
+
+//-----------------------------------------------------------------------------
+
+0.9.0 03/07/2013
+
+ - Initial "preview" release. CUB is the first durable, high-performance library
+ of cooperative block-level, warp-level, and thread-level primitives for CUDA
+ kernel programming. More primitives and examples coming soon!
+
\ No newline at end of file
diff --git a/3rdparty/cub-1.8.0/LICENSE.TXT b/3rdparty/cub-1.8.0/LICENSE.TXT
new file mode 100644
index 00000000..a678e64f
--- /dev/null
+++ b/3rdparty/cub-1.8.0/LICENSE.TXT
@@ -0,0 +1,24 @@
+Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
+Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the NVIDIA CORPORATION nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file
diff --git a/3rdparty/cub-1.8.0/README.md b/3rdparty/cub-1.8.0/README.md
new file mode 100644
index 00000000..d2a338e7
--- /dev/null
+++ b/3rdparty/cub-1.8.0/README.md
@@ -0,0 +1,128 @@
+
+
+<h3>About CUB</h3>
+
+Current release: v1.8.0 (02/16/2018)
+
+We recommend the [CUB Project Website](http://nvlabs.github.com/cub) for further information and examples.
+
+CUB provides state-of-the-art, reusable software components for every layer
+of the CUDA programming model:
+- [Device-wide primitives](https://nvlabs.github.com/cub/group___device_module.html)
+ - Sort, prefix scan, reduction, histogram, etc.
+ - Compatible with CUDA dynamic parallelism
+- [Block-wide "collective" primitives](https://nvlabs.github.com/cub/group___block_module.html)
+ - I/O, sort, prefix scan, reduction, histogram, etc.
+ - Compatible with arbitrary thread block sizes and types
+- [Warp-wide "collective" primitives](https://nvlabs.github.com/cub/group___warp_module.html)
+ - Warp-wide prefix scan, reduction, etc.
+ - Safe and architecture-specific
+- [Thread and resource utilities](https://nvlabs.github.com/cub/group___thread_module.html)
+ - PTX intrinsics, device reflection, texture-caching iterators, caching memory allocators, etc.
+
+![Orientation of collective primitives within the CUDA software stack](http://nvlabs.github.com/cub/cub_overview.png)
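+
+The device-wide primitives listed above follow a two-phase call pattern: a first
+call with a NULL temporary-storage pointer only reports the required workspace
+size, and a second call runs the algorithm. A minimal sketch of that pattern,
+assuming the cub::DeviceReduce::Sum entry point (this example is an illustration,
+not an excerpt from the library):
+
+```C++
+#include <cub/cub.cuh>
+
+// Sum num_items integers from d_in into d_out[0] with a device-wide primitive.
+void SumWithDeviceReduce(int *d_in, int *d_out, int num_items)
+{
+    void   *d_temp_storage     = NULL;
+    size_t  temp_storage_bytes = 0;
+
+    // First call: only computes the required temporary storage size.
+    cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+    cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+    // Second call: performs the reduction.
+    cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+    cudaFree(d_temp_storage);
+}
+```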
+
+
+
+<h3>A Simple Example</h3>
+
+```C++
+#include <cub/cub.cuh>
+
+// Block-sorting CUDA kernel
+__global__ void BlockSortKernel(int *d_in, int *d_out)
+{
+    using namespace cub;
+
+    // Specialize BlockRadixSort, BlockLoad, and BlockStore for 128 threads
+    // owning 16 integer items each
+    typedef BlockRadixSort<int, 128, 16>                      BlockRadixSort;
+    typedef BlockLoad<int, 128, 16, BLOCK_LOAD_TRANSPOSE>     BlockLoad;
+    typedef BlockStore<int, 128, 16, BLOCK_STORE_TRANSPOSE>   BlockStore;
+
+    // Allocate shared memory
+    __shared__ union {
+        typename BlockRadixSort::TempStorage  sort;
+        typename BlockLoad::TempStorage       load;
+        typename BlockStore::TempStorage      store;
+    } temp_storage;
+
+    int block_offset = blockIdx.x * (128 * 16);  // OffsetT for this block's segment
+
+    // Obtain a segment of 2048 consecutive keys that are blocked across threads
+    int thread_keys[16];
+    BlockLoad(temp_storage.load).Load(d_in + block_offset, thread_keys);
+    __syncthreads();
+
+    // Collectively sort the keys
+    BlockRadixSort(temp_storage.sort).Sort(thread_keys);
+    __syncthreads();
+
+    // Store the sorted segment
+    BlockStore(temp_storage.store).Store(d_out + block_offset, thread_keys);
+}
+```
+
+Each thread block uses cub::BlockRadixSort to collectively sort
+its own input segment. The class is specialized by the
+data type being sorted, by the number of threads per block, by the number of
+keys per thread, and implicitly by the targeted compilation architecture.
+
+The cub::BlockLoad and cub::BlockStore classes are similarly specialized.
+Furthermore, to provide coalesced accesses to device memory, these primitives are
+configured to access memory using a striped access pattern (where consecutive threads
+simultaneously access consecutive items) and then transpose the keys into
+a [blocked arrangement](index.html#sec4sec3) of elements across threads.
+
+Once specialized, these classes expose opaque \p TempStorage member types.
+The thread block uses these storage types to statically allocate the union of
+shared memory needed by the thread block. (Alternatively these storage types
+could be aliased to global memory allocations).
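+
+As a usage sketch (assuming the input size is a multiple of the 2048-key tile
+processed by each block), the kernel above could be launched from the host as
+follows:
+
+```C++
+// Minimal host-side driver for the BlockSortKernel example above.
+void BlockSort(int *d_in, int *d_out, int num_items)
+{
+    const int block_threads    = 128;
+    const int items_per_thread = 16;
+    const int tile_size        = block_threads * items_per_thread;  // 2048 keys per block
+
+    // One thread block per 2048-key segment (assumes num_items % tile_size == 0).
+    int grid_size = num_items / tile_size;
+    BlockSortKernel<<<grid_size, block_threads>>>(d_in, d_out);
+    cudaDeviceSynchronize();  // wait for the device-side sort to complete
+}
+```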
+
+
+
+<h3>Stable Releases</h3>
+
+CUB releases are labeled using version identifiers having three fields:
+*epoch.feature.update*. The *epoch* field corresponds to support for
+a major change in the CUDA programming model. The *feature* field
+corresponds to a stable set of features, functionality, and interface. The
+*update* field corresponds to a bug-fix or performance update for that
+feature set. At the moment, we do not publicly provide non-stable releases
+such as development snapshots, beta releases or rolling releases. (Feel free
+to contact us if you would like such things.) See the
+[CUB Project Website](http://nvlabs.github.com/cub) for more information.
+
+
+
+## Contributors
+
+CUB is developed as an open-source project by [NVIDIA Research](http://research.nvidia.com). The primary contributor is [Duane Merrill](http://github.com/dumerrill).
+
+
+
+## Open Source License
+
+CUB is available under the "New BSD" open-source license:
+
+```
+Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
+Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the NVIDIA CORPORATION nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+```
diff --git a/3rdparty/cub-1.8.0/common.mk b/3rdparty/cub-1.8.0/common.mk
new file mode 100644
index 00000000..82893ab9
--- /dev/null
+++ b/3rdparty/cub-1.8.0/common.mk
@@ -0,0 +1,233 @@
+#/******************************************************************************
+# * Copyright (c) 2011, Duane Merrill. All rights reserved.
+# * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
+# *
+# * Redistribution and use in source and binary forms, with or without
+# * modification, are permitted provided that the following conditions are met:
+# * * Redistributions of source code must retain the above copyright
+# * notice, this list of conditions and the following disclaimer.
+# * * Redistributions in binary form must reproduce the above copyright
+# * notice, this list of conditions and the following disclaimer in the
+# * documentation and/or other materials provided with the distribution.
+# * * Neither the name of the NVIDIA CORPORATION nor the
+# * names of its contributors may be used to endorse or promote products
+# * derived from this software without specific prior written permission.
+# *
+# * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+# * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# *
+#******************************************************************************/
+
+
+#-------------------------------------------------------------------------------
+# Commandline Options
+#-------------------------------------------------------------------------------
+
+# [sm=<XXX,...>] Compute-capability to compile for, e.g., "sm=200,300,350" (SM20 by default).
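+#
+# Example (illustrative usage, not part of the original file): from a directory
+# whose makefile includes this one, "make <target> sm=350,520 cdp=1" compiles
+# <target> for both SM35 and SM52 with CUDA dynamic parallelism enabled.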
+
+COMMA = ,
+ifdef sm
+ SM_ARCH = $(subst $(COMMA),-,$(sm))
+else
+ SM_ARCH = 200
+endif
+
+ifeq (700, $(findstring 700, $(SM_ARCH)))
+ SM_TARGETS += -gencode=arch=compute_70,code=\"sm_70,compute_70\"
+ SM_DEF += -DSM700
+ TEST_ARCH = 700
+endif
+ifeq (620, $(findstring 620, $(SM_ARCH)))
+ SM_TARGETS += -gencode=arch=compute_62,code=\"sm_62,compute_62\"
+ SM_DEF += -DSM620
+ TEST_ARCH = 620
+endif
+ifeq (610, $(findstring 610, $(SM_ARCH)))
+ SM_TARGETS += -gencode=arch=compute_61,code=\"sm_61,compute_61\"
+ SM_DEF += -DSM610
+ TEST_ARCH = 610
+endif
+ifeq (600, $(findstring 600, $(SM_ARCH)))
+ SM_TARGETS += -gencode=arch=compute_60,code=\"sm_60,compute_60\"
+ SM_DEF += -DSM600
+ TEST_ARCH = 600
+endif
+ifeq (520, $(findstring 520, $(SM_ARCH)))
+ SM_TARGETS += -gencode=arch=compute_52,code=\"sm_52,compute_52\"
+ SM_DEF += -DSM520
+ TEST_ARCH = 520
+endif
+ifeq (370, $(findstring 370, $(SM_ARCH)))
+ SM_TARGETS += -gencode=arch=compute_37,code=\"sm_37,compute_37\"
+ SM_DEF += -DSM370
+ TEST_ARCH = 370
+endif
+ifeq (350, $(findstring 350, $(SM_ARCH)))
+ SM_TARGETS += -gencode=arch=compute_35,code=\"sm_35,compute_35\"
+ SM_DEF += -DSM350
+ TEST_ARCH = 350
+endif
+ifeq (300, $(findstring 300, $(SM_ARCH)))
+ SM_TARGETS += -gencode=arch=compute_30,code=\"sm_30,compute_30\"
+ SM_DEF += -DSM300
+ TEST_ARCH = 300
+endif
+ifeq (210, $(findstring 210, $(SM_ARCH)))
+ SM_TARGETS += -gencode=arch=compute_20,code=\"sm_21,compute_20\"
+ SM_DEF += -DSM210
+ TEST_ARCH = 210
+endif
+ifeq (200, $(findstring 200, $(SM_ARCH)))
+ SM_TARGETS += -gencode=arch=compute_20,code=\"sm_20,compute_20\"
+ SM_DEF += -DSM200
+ TEST_ARCH = 200
+endif
+ifeq (130, $(findstring 130, $(SM_ARCH)))
+ SM_TARGETS += -gencode=arch=compute_13,code=\"sm_13,compute_13\"
+ SM_DEF += -DSM130
+ TEST_ARCH = 130
+endif
+ifeq (120, $(findstring 120, $(SM_ARCH)))
+ SM_TARGETS += -gencode=arch=compute_12,code=\"sm_12,compute_12\"
+ SM_DEF += -DSM120
+ TEST_ARCH = 120
+endif
+ifeq (110, $(findstring 110, $(SM_ARCH)))
+ SM_TARGETS += -gencode=arch=compute_11,code=\"sm_11,compute_11\"
+ SM_DEF += -DSM110
+ TEST_ARCH = 110
+endif
+ifeq (100, $(findstring 100, $(SM_ARCH)))
+ SM_TARGETS += -gencode=arch=compute_10,code=\"sm_10,compute_10\"
+ SM_DEF += -DSM100
+ TEST_ARCH = 100
+endif
+
+
+# [cdp=<0|1>] CDP enable option (default: no)
+ifeq ($(cdp), 1)
+ DEFINES += -DCUB_CDP
+ CDP_SUFFIX = cdp
+ NVCCFLAGS += -rdc=true -lcudadevrt
+else
+ CDP_SUFFIX = nocdp
+endif
+
+
+# [force32=<0|1>] Device addressing mode option (64-bit device pointers by default)
+ifeq ($(force32), 1)
+ CPU_ARCH = -m32
+ CPU_ARCH_SUFFIX = i386
+else
+ CPU_ARCH = -m64
+ CPU_ARCH_SUFFIX = x86_64
+ NPPI = -lnppist
+endif
+
+
+# [abi=<0|1>] CUDA ABI option (enabled by default)
+ifneq ($(abi), 0)
+ ABI_SUFFIX = abi
+else
+ NVCCFLAGS += -Xptxas -abi=no
+ ABI_SUFFIX = noabi
+endif
+
+
+# [open64=<0|1>] Middle-end compiler option (nvvm by default)
+ifeq ($(open64), 1)
+ NVCCFLAGS += -open64
+ PTX_SUFFIX = open64
+else
+ PTX_SUFFIX = nvvm
+endif
+
+
+# [verbose=<0|1>] Verbose toolchain output from nvcc option
+ifeq ($(verbose), 1)
+ NVCCFLAGS += -v
+endif
+
+
+# [keep=<0|1>] Keep intermediate compilation artifacts option
+ifeq ($(keep), 1)
+ NVCCFLAGS += -keep
+endif
+
+# [debug=<0|1>] Generate debug mode code
+ifeq ($(debug), 1)
+ NVCCFLAGS += -G
+endif
+
+
+#-------------------------------------------------------------------------------
+# Compiler and compilation platform
+#-------------------------------------------------------------------------------
+
+CUB_DIR = $(dir $(lastword $(MAKEFILE_LIST)))
+
+NVCC = "$(shell which nvcc)"
+ifdef nvccver
+ NVCC_VERSION = $(nvccver)
+else
+ NVCC_VERSION = $(strip $(shell nvcc --version | grep release | sed 's/.*release //' | sed 's/,.*//'))
+endif
+
+# detect OS
+OSUPPER = $(shell uname -s 2>/dev/null | tr [:lower:] [:upper:])
+
+# Default flags: verbose kernel properties (regs, smem, cmem, etc.); runtimes for compilation phases
+NVCCFLAGS += $(SM_DEF) -Xptxas -v -Xcudafe -\#
+
+ifeq (WIN_NT, $(findstring WIN_NT, $(OSUPPER)))
+ # For MSVC
+ # Enable more warnings and treat as errors
+ NVCCFLAGS += -Xcompiler /W3 -Xcompiler /WX
+ # Disable excess x86 floating point precision that can lead to results being labeled incorrectly
+ NVCCFLAGS += -Xcompiler /fp:strict
+ # Help the compiler/linker work with huge numbers of kernels on Windows
+ NVCCFLAGS += -Xcompiler /bigobj -Xcompiler /Zm500
+ CC = cl
+
+ # Multithreaded runtime
+ NVCCFLAGS += -Xcompiler /MT
+
+ifneq ($(force32), 1)
+ CUDART_CYG = "$(shell dirname $(NVCC))/../lib/x64/cudart.lib"
+else
+ CUDART_CYG = "$(shell dirname $(NVCC))/../lib/Win32/cudart.lib"
+endif
+ CUDART = "$(shell cygpath -w $(CUDART_CYG))"
+else
+ # For g++
+ # Disable excess x86 floating point precision that can lead to results being labeled incorrectly
+ NVCCFLAGS += -Xcompiler -ffloat-store
+ CC = g++
+ifneq ($(force32), 1)
+ CUDART = "$(shell dirname $(NVCC))/../lib64/libcudart_static.a"
+else
+ CUDART = "$(shell dirname $(NVCC))/../lib/libcudart_static.a"
+endif
+endif
+
+# Suffix to append to each binary
+BIN_SUFFIX = sm$(SM_ARCH)_$(PTX_SUFFIX)_$(NVCC_VERSION)_$(ABI_SUFFIX)_$(CDP_SUFFIX)_$(CPU_ARCH_SUFFIX)
+
+
+#-------------------------------------------------------------------------------
+# Dependency Lists
+#-------------------------------------------------------------------------------
+
+rwildcard=$(foreach d,$(wildcard $1*),$(call rwildcard,$d/,$2) $(filter $(subst *,%,$2),$d))
+
+CUB_DEPS = $(call rwildcard, $(CUB_DIR),*.cuh) \
+ $(CUB_DIR)common.mk
+
diff --git a/3rdparty/cub-1.8.0/cub/agent/agent_histogram.cuh b/3rdparty/cub-1.8.0/cub/agent/agent_histogram.cuh
new file mode 100644
index 00000000..37b1ec97
--- /dev/null
+++ b/3rdparty/cub-1.8.0/cub/agent/agent_histogram.cuh
@@ -0,0 +1,787 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::AgentHistogram implements a stateful abstraction of CUDA thread blocks for participating in device-wide histogram.
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "../util_type.cuh"
+#include "../block/block_load.cuh"
+#include "../grid/grid_queue.cuh"
+#include "../iterator/cache_modified_input_iterator.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Tuning policy
+ ******************************************************************************/
+
+/**
+ * Memory preference for block-privatized histogram bin counters
+ */
+enum BlockHistogramMemoryPreference
+{
+ GMEM,
+ SMEM,
+ BLEND
+};
+
+
+/**
+ * Parameterizable tuning policy type for AgentHistogram
+ */
+template <
+ int _BLOCK_THREADS, ///< Threads per thread block
+ int _PIXELS_PER_THREAD, ///< Pixels per thread (per tile of input)
+ BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use
+ CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements
+ bool _RLE_COMPRESS, ///< Whether to perform localized RLE to compress samples before histogramming
+ BlockHistogramMemoryPreference _MEM_PREFERENCE, ///< Whether to prefer privatized shared-memory bins (versus privatized global-memory bins)
+ bool _WORK_STEALING> ///< Whether to dequeue tiles from a global work queue
+struct AgentHistogramPolicy
+{
+ enum
+ {
+ BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block
+ PIXELS_PER_THREAD = _PIXELS_PER_THREAD, ///< Pixels per thread (per tile of input)
+ IS_RLE_COMPRESS = _RLE_COMPRESS, ///< Whether to perform localized RLE to compress samples before histogramming
+ MEM_PREFERENCE = _MEM_PREFERENCE, ///< Whether to prefer privatized shared-memory bins (versus privatized global-memory bins)
+ IS_WORK_STEALING = _WORK_STEALING, ///< Whether to dequeue tiles from a global work queue
+ };
+
+ static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use
+ static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements
+};
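+
+// Illustrative note (not part of the original header): a dispatch layer would
+// typically instantiate this policy along the lines of
+// AgentHistogramPolicy<256, 8, BLOCK_LOAD_DIRECT, LOAD_LDG, true, BLEND, true>,
+// i.e., 256-thread blocks, 8 pixels per thread, RLE compression of repeated
+// samples, blended smem/gmem privatization, and work-stealing tile assignment.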
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+/**
+ * \brief AgentHistogram implements a stateful abstraction of CUDA thread blocks for participating in device-wide histogram.
+ */
+template <
+ typename AgentHistogramPolicyT, ///< Parameterized AgentHistogramPolicy tuning policy type
+ int PRIVATIZED_SMEM_BINS, ///< Number of privatized shared-memory histogram bins of any channel. Zero indicates privatized counters to be maintained in device-accessible memory.
+ int NUM_CHANNELS, ///< Number of channels interleaved in the input data. Supports up to four channels.
+ int NUM_ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed
+ typename SampleIteratorT, ///< Random-access input iterator type for reading samples
+ typename CounterT, ///< Integer type for counting sample occurrences per histogram bin
+ typename PrivatizedDecodeOpT, ///< The transform operator type for determining privatized counter indices from samples, one for each channel
+ typename OutputDecodeOpT, ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel
+ typename OffsetT, ///< Signed integer type for global offsets
+ int PTX_ARCH = CUB_PTX_ARCH> ///< PTX compute capability
+struct AgentHistogram
+{
+ //---------------------------------------------------------------------
+ // Types and constants
+ //---------------------------------------------------------------------
+
+ /// The sample type of the input iterator
+ typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
+
+ /// The pixel type of SampleT
+ typedef typename CubVector<SampleT, NUM_CHANNELS>::Type PixelT;
+
+ /// The quad type of SampleT
+ typedef typename CubVector<SampleT, 4>::Type QuadT;
+
+ /// Constants
+ enum
+ {
+ BLOCK_THREADS = AgentHistogramPolicyT::BLOCK_THREADS,
+
+ PIXELS_PER_THREAD = AgentHistogramPolicyT::PIXELS_PER_THREAD,
+ SAMPLES_PER_THREAD = PIXELS_PER_THREAD * NUM_CHANNELS,
+ QUADS_PER_THREAD = SAMPLES_PER_THREAD / 4,
+
+ TILE_PIXELS = PIXELS_PER_THREAD * BLOCK_THREADS,
+ TILE_SAMPLES = SAMPLES_PER_THREAD * BLOCK_THREADS,
+
+ IS_RLE_COMPRESS = AgentHistogramPolicyT::IS_RLE_COMPRESS,
+
+ MEM_PREFERENCE = (PRIVATIZED_SMEM_BINS > 0) ?
+ AgentHistogramPolicyT::MEM_PREFERENCE :
+ GMEM,
+
+ IS_WORK_STEALING = AgentHistogramPolicyT::IS_WORK_STEALING,
+ };
+
+ /// Cache load modifier for reading input elements
+ static const CacheLoadModifier LOAD_MODIFIER = AgentHistogramPolicyT::LOAD_MODIFIER;
+
+
+ /// Input iterator wrapper type (for applying cache modifier)
+ typedef typename If<IsPointer<SampleIteratorT>::VALUE,
+ CacheModifiedInputIterator<LOAD_MODIFIER, SampleT, OffsetT>, // Wrap the native input pointer with CacheModifiedInputIterator
+ SampleIteratorT>::Type // Directly use the supplied input iterator type
+ WrappedSampleIteratorT;
+
+ /// Pixel input iterator type (for applying cache modifier)
+ typedef CacheModifiedInputIterator<LOAD_MODIFIER, PixelT, OffsetT>
+ WrappedPixelIteratorT;
+
+ /// Quad input iterator type (for applying cache modifier)
+ typedef CacheModifiedInputIterator<LOAD_MODIFIER, QuadT, OffsetT>
+ WrappedQuadIteratorT;
+
+ /// Parameterized BlockLoad type for samples
+ typedef BlockLoad<
+ SampleT,
+ BLOCK_THREADS,
+ SAMPLES_PER_THREAD,
+ AgentHistogramPolicyT::LOAD_ALGORITHM>
+ BlockLoadSampleT;
+
+ /// Parameterized BlockLoad type for pixels
+ typedef BlockLoad<
+ PixelT,
+ BLOCK_THREADS,
+ PIXELS_PER_THREAD,
+ AgentHistogramPolicyT::LOAD_ALGORITHM>
+ BlockLoadPixelT;
+
+ /// Parameterized BlockLoad type for quads
+ typedef BlockLoad<
+ QuadT,
+ BLOCK_THREADS,
+ QUADS_PER_THREAD,
+ AgentHistogramPolicyT::LOAD_ALGORITHM>
+ BlockLoadQuadT;
+
+ /// Shared memory type required by this thread block
+ struct _TempStorage
+ {
+ CounterT histograms[NUM_ACTIVE_CHANNELS][PRIVATIZED_SMEM_BINS + 1]; // Smem needed for block-privatized smem histogram (with 1 word of padding)
+
+ int tile_idx;
+
+ // Aliasable storage layout
+ union Aliasable
+ {
+ typename BlockLoadSampleT::TempStorage sample_load; // Smem needed for loading a tile of samples
+ typename BlockLoadPixelT::TempStorage pixel_load; // Smem needed for loading a tile of pixels
+ typename BlockLoadQuadT::TempStorage quad_load; // Smem needed for loading a tile of quads
+
+ } aliasable;
+ };
+
+
+ /// Temporary storage type (unionable)
+ struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+ //---------------------------------------------------------------------
+ // Per-thread fields
+ //---------------------------------------------------------------------
+
+ /// Reference to temp_storage
+ _TempStorage &temp_storage;
+
+ /// Sample input iterator (with cache modifier applied, if possible)
+ WrappedSampleIteratorT d_wrapped_samples;
+
+ /// Native pointer for input samples (possibly NULL if unavailable)
+ SampleT* d_native_samples;
+
+ /// The number of output bins for each channel
+ int (&num_output_bins)[NUM_ACTIVE_CHANNELS];
+
+ /// The number of privatized bins for each channel
+ int (&num_privatized_bins)[NUM_ACTIVE_CHANNELS];
+
+ /// Reference to gmem privatized histograms for each channel
+ CounterT* d_privatized_histograms[NUM_ACTIVE_CHANNELS];
+
+ /// Reference to final output histograms (gmem)
+ CounterT* (&d_output_histograms)[NUM_ACTIVE_CHANNELS];
+
+ /// The transform operator for determining output bin-ids from privatized counter indices, one for each channel
+ OutputDecodeOpT (&output_decode_op)[NUM_ACTIVE_CHANNELS];
+
+ /// The transform operator for determining privatized counter indices from samples, one for each channel
+ PrivatizedDecodeOpT (&privatized_decode_op)[NUM_ACTIVE_CHANNELS];
+
+ /// Whether to prefer privatized smem counters vs privatized global counters
+ bool prefer_smem;
+
+
+ //---------------------------------------------------------------------
+ // Initialize privatized bin counters
+ //---------------------------------------------------------------------
+
+ // Initialize privatized bin counters
+ __device__ __forceinline__ void InitBinCounters(CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS])
+ {
+ // Initialize histogram bin counts to zeros
+ #pragma unroll
+ for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+ {
+ for (int privatized_bin = threadIdx.x; privatized_bin < num_privatized_bins[CHANNEL]; privatized_bin += BLOCK_THREADS)
+ {
+ privatized_histograms[CHANNEL][privatized_bin] = 0;
+ }
+ }
+
+ // Barrier to make sure all threads are done updating counters
+ CTA_SYNC();
+ }
+
+
+ // Initialize privatized bin counters. Specialized for privatized shared-memory counters
+ __device__ __forceinline__ void InitSmemBinCounters()
+ {
+ CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS];
+
+ for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+ privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL];
+
+ InitBinCounters(privatized_histograms);
+ }
+
+
+ // Initialize privatized bin counters. Specialized for privatized global-memory counters
+ __device__ __forceinline__ void InitGmemBinCounters()
+ {
+ InitBinCounters(d_privatized_histograms);
+ }
+
+
+ //---------------------------------------------------------------------
+ // Update final output histograms
+ //---------------------------------------------------------------------
+
+ // Update final output histograms from privatized histograms
+ __device__ __forceinline__ void StoreOutput(CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS])
+ {
+ // Barrier to make sure all threads are done updating counters
+ CTA_SYNC();
+
+ // Apply privatized bin counts to output bin counts
+ #pragma unroll
+ for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+ {
+ int channel_bins = num_privatized_bins[CHANNEL];
+ for (int privatized_bin = threadIdx.x;
+ privatized_bin < channel_bins;
+ privatized_bin += BLOCK_THREADS)
+ {
+ int output_bin = -1;
+ CounterT count = privatized_histograms[CHANNEL][privatized_bin];
+ bool is_valid = count > 0;
+
+ output_decode_op[CHANNEL].template BinSelect<LOAD_MODIFIER>((SampleT) privatized_bin, output_bin, is_valid);
+
+ if (output_bin >= 0)
+ {
+ atomicAdd(&d_output_histograms[CHANNEL][output_bin], count);
+ }
+
+ }
+ }
+ }
+
+
+ // Update final output histograms from privatized histograms. Specialized for privatized shared-memory counters
+ __device__ __forceinline__ void StoreSmemOutput()
+ {
+ CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS];
+ for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+ privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL];
+
+ StoreOutput(privatized_histograms);
+ }
+
+
+ // Update final output histograms from privatized histograms. Specialized for privatized global-memory counters
+ __device__ __forceinline__ void StoreGmemOutput()
+ {
+ StoreOutput(d_privatized_histograms);
+ }
+
+
+ //---------------------------------------------------------------------
+ // Tile accumulation
+ //---------------------------------------------------------------------
+
+ // Accumulate pixels. Specialized for RLE compression.
+ __device__ __forceinline__ void AccumulatePixels(
+ SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS],
+ bool is_valid[PIXELS_PER_THREAD],
+ CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS],
+ Int2Type<true> is_rle_compress)
+ {
+ #pragma unroll
+ for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+ {
+ // Bin pixels
+ int bins[PIXELS_PER_THREAD];
+
+ #pragma unroll
+ for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
+ {
+ bins[PIXEL] = -1;
+ privatized_decode_op[CHANNEL].template BinSelect<LOAD_MODIFIER>(samples[PIXEL][CHANNEL], bins[PIXEL], is_valid[PIXEL]);
+ }
+
+ CounterT accumulator = 1;
+
+ #pragma unroll
+ for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD - 1; ++PIXEL)
+ {
+ if (bins[PIXEL] != bins[PIXEL + 1])
+ {
+ if (bins[PIXEL] >= 0)
+ atomicAdd(privatized_histograms[CHANNEL] + bins[PIXEL], accumulator);
+
+ accumulator = 0;
+ }
+ accumulator++;
+ }
+
+ // Last pixel
+ if (bins[PIXELS_PER_THREAD - 1] >= 0)
+ atomicAdd(privatized_histograms[CHANNEL] + bins[PIXELS_PER_THREAD - 1], accumulator);
+ }
+ }
+
+
+ // Accumulate pixels. Specialized for individual accumulation of each pixel.
+ __device__ __forceinline__ void AccumulatePixels(
+ SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS],
+ bool is_valid[PIXELS_PER_THREAD],
+ CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS],
+ Int2Type<false> is_rle_compress)
+ {
+ #pragma unroll
+ for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
+ {
+ #pragma unroll
+ for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+ {
+ int bin = -1;
+ privatized_decode_op[CHANNEL].template BinSelect<LOAD_MODIFIER>(samples[PIXEL][CHANNEL], bin, is_valid[PIXEL]);
+ if (bin >= 0)
+ atomicAdd(privatized_histograms[CHANNEL] + bin, 1);
+ }
+ }
+ }
+
+
+ /**
+ * Accumulate pixel, specialized for smem privatized histogram
+ */
+ __device__ __forceinline__ void AccumulateSmemPixels(
+ SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS],
+ bool is_valid[PIXELS_PER_THREAD])
+ {
+ CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS];
+
+ for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+ privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL];
+
+ AccumulatePixels(samples, is_valid, privatized_histograms, Int2Type<IS_RLE_COMPRESS>());
+ }
+
+
+ /**
+ * Accumulate pixel, specialized for gmem privatized histogram
+ */
+ __device__ __forceinline__ void AccumulateGmemPixels(
+ SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS],
+ bool is_valid[PIXELS_PER_THREAD])
+ {
+ AccumulatePixels(samples, is_valid, d_privatized_histograms, Int2Type<IS_RLE_COMPRESS>());
+ }
+
+
+
+ //---------------------------------------------------------------------
+ // Tile loading
+ //---------------------------------------------------------------------
+
+ // Load full, aligned tile using pixel iterator (multi-channel)
+ template <int _NUM_ACTIVE_CHANNELS>
+ __device__ __forceinline__ void LoadFullAlignedTile(
+ OffsetT block_offset,
+ int valid_samples,
+ SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
+ Int2Type<_NUM_ACTIVE_CHANNELS> num_active_channels)
+ {
+ typedef PixelT AliasedPixels[PIXELS_PER_THREAD];
+
+ WrappedPixelIteratorT d_wrapped_pixels((PixelT*) (d_native_samples + block_offset));
+
+ // Load using a wrapped pixel iterator
+ BlockLoadPixelT(temp_storage.aliasable.pixel_load).Load(
+ d_wrapped_pixels,
+ reinterpret_cast<AliasedPixels&>(samples));
+ }
+
+ // Load full, aligned tile using quad iterator (single-channel)
+ __device__ __forceinline__ void LoadFullAlignedTile(
+ OffsetT block_offset,
+ int valid_samples,
+ SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
+ Int2Type<1> num_active_channels)
+ {
+ typedef QuadT AliasedQuads[QUADS_PER_THREAD];
+
+ WrappedQuadIteratorT d_wrapped_quads((QuadT*) (d_native_samples + block_offset));
+
+ // Load using a wrapped quad iterator
+ BlockLoadQuadT(temp_storage.aliasable.quad_load).Load(
+ d_wrapped_quads,
+ reinterpret_cast<AliasedQuads&>(samples));
+ }
+
+ // Load full, aligned tile
+ __device__ __forceinline__ void LoadTile(
+ OffsetT block_offset,
+ int valid_samples,
+ SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
+ Int2Type<true> is_full_tile,
+ Int2Type<true> is_aligned)
+ {
+ LoadFullAlignedTile(block_offset, valid_samples, samples, Int2Type<NUM_ACTIVE_CHANNELS>());
+ }
+
+ // Load full, mis-aligned tile using sample iterator
+ __device__ __forceinline__ void LoadTile(
+ OffsetT block_offset,
+ int valid_samples,
+ SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
+ Int2Type<true> is_full_tile,
+ Int2Type<false> is_aligned)
+ {
+ typedef SampleT AliasedSamples[SAMPLES_PER_THREAD];
+
+ // Load using sample iterator
+ BlockLoadSampleT(temp_storage.aliasable.sample_load).Load(
+ d_wrapped_samples + block_offset,
+ reinterpret_cast<AliasedSamples&>(samples));
+ }
+
+ // Load partially-full, aligned tile using the pixel iterator
+ __device__ __forceinline__ void LoadTile(
+ OffsetT block_offset,
+ int valid_samples,
+ SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
+ Int2Type<false> is_full_tile,
+ Int2Type<true> is_aligned)
+ {
+ typedef PixelT AliasedPixels[PIXELS_PER_THREAD];
+
+ WrappedPixelIteratorT d_wrapped_pixels((PixelT*) (d_native_samples + block_offset));
+
+ int valid_pixels = valid_samples / NUM_CHANNELS;
+
+ // Load using a wrapped pixel iterator
+ BlockLoadPixelT(temp_storage.aliasable.pixel_load).Load(
+ d_wrapped_pixels,
+ reinterpret_cast<AliasedPixels&>(samples),
+ valid_pixels);
+ }
+
+ // Load partially-full, mis-aligned tile using sample iterator
+ __device__ __forceinline__ void LoadTile(
+ OffsetT block_offset,
+ int valid_samples,
+ SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
+ Int2Type<false> is_full_tile,
+ Int2Type<false> is_aligned)
+ {
+ typedef SampleT AliasedSamples[SAMPLES_PER_THREAD];
+
+ BlockLoadSampleT(temp_storage.aliasable.sample_load).Load(
+ d_wrapped_samples + block_offset,
+ reinterpret_cast<AliasedSamples&>(samples),
+ valid_samples);
+ }
+
+
+ //---------------------------------------------------------------------
+ // Tile processing
+ //---------------------------------------------------------------------
+
+ // Consume a tile of data samples
+ template <
+ bool IS_ALIGNED, // Whether the tile offset is aligned (quad-aligned for single-channel, pixel-aligned for multi-channel)
+ bool IS_FULL_TILE> // Whether the tile is full
+ __device__ __forceinline__ void ConsumeTile(OffsetT block_offset, int valid_samples)
+ {
+ SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS];
+ bool is_valid[PIXELS_PER_THREAD];
+
+ // Load tile
+ LoadTile(
+ block_offset,
+ valid_samples,
+ samples,
+ Int2Type<IS_FULL_TILE>(),
+ Int2Type<IS_ALIGNED>());
+
+ // Set valid flags
+ #pragma unroll
+ for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
+ is_valid[PIXEL] = IS_FULL_TILE || (((threadIdx.x * PIXELS_PER_THREAD + PIXEL) * NUM_CHANNELS) < valid_samples);
+
+ // Accumulate samples
+#if CUB_PTX_ARCH >= 120
+ if (prefer_smem)
+ AccumulateSmemPixels(samples, is_valid);
+ else
+ AccumulateGmemPixels(samples, is_valid);
+#else
+ AccumulateGmemPixels(samples, is_valid);
+#endif
+
+ }
+
+
+ // Consume row tiles. Specialized for work-stealing from queue
+ template <bool IS_ALIGNED>
+ __device__ __forceinline__ void ConsumeTiles(
+ OffsetT num_row_pixels, ///< The number of multi-channel pixels per row in the region of interest
+ OffsetT num_rows, ///< The number of rows in the region of interest
+ OffsetT row_stride_samples, ///< The number of samples between starts of consecutive rows in the region of interest
+ int tiles_per_row, ///< Number of image tiles per row
+ GridQueue<int> tile_queue,
+ Int2Type<true> is_work_stealing)
+ {
+
+ int num_tiles = num_rows * tiles_per_row;
+ int tile_idx = (blockIdx.y * gridDim.x) + blockIdx.x;
+ OffsetT num_even_share_tiles = gridDim.x * gridDim.y;
+
+ while (tile_idx < num_tiles)
+ {
+ int row = tile_idx / tiles_per_row;
+ int col = tile_idx - (row * tiles_per_row);
+ OffsetT row_offset = row * row_stride_samples;
+ OffsetT col_offset = (col * TILE_SAMPLES);
+ OffsetT tile_offset = row_offset + col_offset;
+
+ if (col == tiles_per_row - 1)
+ {
+ // Consume a partially-full tile at the end of the row
+ OffsetT num_remaining = (num_row_pixels * NUM_CHANNELS) - col_offset;
+ ConsumeTile<IS_ALIGNED, false>(tile_offset, num_remaining);
+ }
+ else
+ {
+ // Consume full tile
+ ConsumeTile<IS_ALIGNED, true>(tile_offset, TILE_SAMPLES);
+ }
+
+ CTA_SYNC();
+
+ // Get next tile
+ if (threadIdx.x == 0)
+ temp_storage.tile_idx = tile_queue.Drain(1) + num_even_share_tiles;
+
+ CTA_SYNC();
+
+ tile_idx = temp_storage.tile_idx;
+ }
+ }
+
+
+ // Consume row tiles. Specialized for even-share (striped across thread blocks)
+ template <bool IS_ALIGNED>
+ __device__ __forceinline__ void ConsumeTiles(
+ OffsetT num_row_pixels, ///< The number of multi-channel pixels per row in the region of interest
+ OffsetT num_rows, ///< The number of rows in the region of interest
+ OffsetT row_stride_samples, ///< The number of samples between starts of consecutive rows in the region of interest
+ int tiles_per_row, ///< Number of image tiles per row
+ GridQueue<int> tile_queue,
+ Int2Type<false> is_work_stealing)
+ {
+ for (int row = blockIdx.y; row < num_rows; row += gridDim.y)
+ {
+ OffsetT row_begin = row * row_stride_samples;
+ OffsetT row_end = row_begin + (num_row_pixels * NUM_CHANNELS);
+ OffsetT tile_offset = row_begin + (blockIdx.x * TILE_SAMPLES);
+
+ while (tile_offset < row_end)
+ {
+ OffsetT num_remaining = row_end - tile_offset;
+
+ if (num_remaining < TILE_SAMPLES)
+ {
+ // Consume partial tile
+ ConsumeTile<IS_ALIGNED, false>(tile_offset, num_remaining);
+ break;
+ }
+
+ // Consume full tile
+ ConsumeTile<IS_ALIGNED, true>(tile_offset, TILE_SAMPLES);
+ tile_offset += gridDim.x * TILE_SAMPLES;
+ }
+ }
+ }
+
+
+ //---------------------------------------------------------------------
+ // Parameter extraction
+ //---------------------------------------------------------------------
+
+ // Return a native pixel pointer (specialized for CacheModifiedInputIterator types)
+ template <
+ CacheLoadModifier _MODIFIER,
+ typename _ValueT,
+ typename _OffsetT>
+ __device__ __forceinline__ SampleT* NativePointer(CacheModifiedInputIterator<_MODIFIER, _ValueT, _OffsetT> itr)
+ {
+ return itr.ptr;
+ }
+
+ // Return a native pixel pointer (specialized for other types)
+ template <typename IteratorT>
+ __device__ __forceinline__ SampleT* NativePointer(IteratorT itr)
+ {
+ return NULL;
+ }
+
+
+
+ //---------------------------------------------------------------------
+ // Interface
+ //---------------------------------------------------------------------
+
+
+ /**
+ * Constructor
+ */
+ __device__ __forceinline__ AgentHistogram(
+ TempStorage &temp_storage, ///< Reference to temp_storage
+ SampleIteratorT d_samples, ///< Input data to reduce
+ int (&num_output_bins)[NUM_ACTIVE_CHANNELS], ///< The number of bins per final output histogram
+ int (&num_privatized_bins)[NUM_ACTIVE_CHANNELS], ///< The number of bins per privatized histogram
+ CounterT* (&d_output_histograms)[NUM_ACTIVE_CHANNELS], ///< Reference to final output histograms
+ CounterT* (&d_privatized_histograms)[NUM_ACTIVE_CHANNELS], ///< Reference to privatized histograms
+ OutputDecodeOpT (&output_decode_op)[NUM_ACTIVE_CHANNELS], ///< The transform operator for determining output bin-ids from privatized counter indices, one for each channel
+ PrivatizedDecodeOpT (&privatized_decode_op)[NUM_ACTIVE_CHANNELS]) ///< The transform operator for determining privatized counter indices from samples, one for each channel
+ :
+ temp_storage(temp_storage.Alias()),
+ d_wrapped_samples(d_samples),
+ num_output_bins(num_output_bins),
+ num_privatized_bins(num_privatized_bins),
+ d_output_histograms(d_output_histograms),
+ privatized_decode_op(privatized_decode_op),
+ output_decode_op(output_decode_op),
+ d_native_samples(NativePointer(d_wrapped_samples)),
+ prefer_smem((MEM_PREFERENCE == SMEM) ?
+ true : // prefer smem privatized histograms
+ (MEM_PREFERENCE == GMEM) ?
+ false : // prefer gmem privatized histograms
+ blockIdx.x & 1) // prefer blended privatized histograms
+ {
+ int blockId = (blockIdx.y * gridDim.x) + blockIdx.x;
+
+ // Initialize the locations of this block's privatized histograms
+ for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+ this->d_privatized_histograms[CHANNEL] = d_privatized_histograms[CHANNEL] + (blockId * num_privatized_bins[CHANNEL]);
+ }
+
+
+ /**
+ * Consume image
+ */
+ __device__ __forceinline__ void ConsumeTiles(
+ OffsetT num_row_pixels, ///< The number of multi-channel pixels per row in the region of interest
+ OffsetT num_rows, ///< The number of rows in the region of interest
+ OffsetT row_stride_samples, ///< The number of samples between starts of consecutive rows in the region of interest
+ int tiles_per_row, ///< Number of image tiles per row
+ GridQueue<int> tile_queue) ///< Queue descriptor for assigning tiles of work to thread blocks
+ {
+ // Check whether all row starting offsets are quad-aligned (in single-channel) or pixel-aligned (in multi-channel)
+ int quad_mask = AlignBytes<QuadT>::ALIGN_BYTES - 1;
+ int pixel_mask = AlignBytes<PixelT>::ALIGN_BYTES - 1;
+ size_t row_bytes = sizeof(SampleT) * row_stride_samples;
+
+ bool quad_aligned_rows = (NUM_CHANNELS == 1) && (SAMPLES_PER_THREAD % 4 == 0) && // Single channel
+ ((size_t(d_native_samples) & quad_mask) == 0) && // ptr is quad-aligned
+ ((num_rows == 1) || ((row_bytes & quad_mask) == 0)); // number of row-samples is a multiple of the alignment of the quad
+
+ bool pixel_aligned_rows = (NUM_CHANNELS > 1) && // Multi channel
+ ((size_t(d_native_samples) & pixel_mask) == 0) && // ptr is pixel-aligned
+ ((row_bytes & pixel_mask) == 0); // number of row-samples is a multiple of the alignment of the pixel
+
+ // Whether rows are aligned and can be vectorized
+ if ((d_native_samples != NULL) && (quad_aligned_rows || pixel_aligned_rows))
+ ConsumeTiles<true>(num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, Int2Type<IS_WORK_STEALING>());
+ else
+ ConsumeTiles<false>(num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, Int2Type<IS_WORK_STEALING>());
+ }
+
+
+ /**
+ * Initialize privatized bin counters. Specialized for privatized shared-memory counters
+ */
+ __device__ __forceinline__ void InitBinCounters()
+ {
+ if (prefer_smem)
+ InitSmemBinCounters();
+ else
+ InitGmemBinCounters();
+ }
+
+
+ /**
+ * Store privatized histogram to device-accessible memory. Specialized for privatized shared-memory counters
+ */
+ __device__ __forceinline__ void StoreOutput()
+ {
+ if (prefer_smem)
+ StoreSmemOutput();
+ else
+ StoreGmemOutput();
+ }
+
+
+};
+
+
+
+
+} // CUB namespace
+CUB_NS_POSTFIX // Optional outer namespace(s)
+
diff --git a/3rdparty/cub-1.8.0/cub/agent/agent_radix_sort_downsweep.cuh b/3rdparty/cub-1.8.0/cub/agent/agent_radix_sort_downsweep.cuh
new file mode 100644
index 00000000..faea8813
--- /dev/null
+++ b/3rdparty/cub-1.8.0/cub/agent/agent_radix_sort_downsweep.cuh
@@ -0,0 +1,789 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * AgentRadixSortDownsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort downsweep.
+ */
+
+
+#pragma once
+
+#include <stdint.h>
+
+#include "../thread/thread_load.cuh"
+#include "../block/block_load.cuh"
+#include "../block/block_store.cuh"
+#include "../block/block_radix_rank.cuh"
+#include "../block/block_exchange.cuh"
+#include "../util_type.cuh"
+#include "../iterator/cache_modified_input_iterator.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Tuning policy types
+ ******************************************************************************/
+
+/**
+ * Radix ranking algorithm
+ */
+enum RadixRankAlgorithm
+{
+ RADIX_RANK_BASIC,
+ RADIX_RANK_MEMOIZE,
+ RADIX_RANK_MATCH
+};
+
+/**
+ * Parameterizable tuning policy type for AgentRadixSortDownsweep
+ */
+template <
+ int _BLOCK_THREADS, ///< Threads per thread block
+ int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input)
+ BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use
+ CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading keys (and values)
+ RadixRankAlgorithm _RANK_ALGORITHM, ///< The radix ranking algorithm to use
+ BlockScanAlgorithm _SCAN_ALGORITHM, ///< The block scan algorithm to use
+ int _RADIX_BITS> ///< The number of radix bits, i.e., log2(bins)
+struct AgentRadixSortDownsweepPolicy
+{
+ enum
+ {
+ BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block
+ ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input)
+ RADIX_BITS = _RADIX_BITS, ///< The number of radix bits, i.e., log2(bins)
+ };
+
+ static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use
+ static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading keys (and values)
+ static const RadixRankAlgorithm RANK_ALGORITHM = _RANK_ALGORITHM; ///< The radix ranking algorithm to use
+ static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use
+};
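+
+// Illustrative note (not part of the original header): a typical downsweep
+// policy instantiation might look like
+// AgentRadixSortDownsweepPolicy<128, 16, BLOCK_LOAD_DIRECT, LOAD_LDG,
+// RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, 6>,
+// i.e., 128-thread blocks consuming 16 keys per thread and ranking 6 radix
+// bits per pass (64 bins).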
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+
+
+
+
+/**
+ * \brief AgentRadixSortDownsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort downsweep.
+ */
+template <
+ typename AgentRadixSortDownsweepPolicy, ///< Parameterized AgentRadixSortDownsweepPolicy tuning policy type
+ bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low
+ typename KeyT, ///< KeyT type
+ typename ValueT, ///< ValueT type
+ typename OffsetT> ///< Signed integer type for global offsets
+struct AgentRadixSortDownsweep
+{
+ //---------------------------------------------------------------------
+ // Type definitions and constants
+ //---------------------------------------------------------------------
+
+ // Appropriate unsigned-bits representation of KeyT
+ typedef typename Traits<KeyT>::UnsignedBits UnsignedBits;
+
+ static const UnsignedBits LOWEST_KEY = Traits<KeyT>::LOWEST_KEY;
+ static const UnsignedBits MAX_KEY = Traits<KeyT>::MAX_KEY;
+
+ static const BlockLoadAlgorithm LOAD_ALGORITHM = AgentRadixSortDownsweepPolicy::LOAD_ALGORITHM;
+ static const CacheLoadModifier LOAD_MODIFIER = AgentRadixSortDownsweepPolicy::LOAD_MODIFIER;
+ static const RadixRankAlgorithm RANK_ALGORITHM = AgentRadixSortDownsweepPolicy::RANK_ALGORITHM;
+ static const BlockScanAlgorithm SCAN_ALGORITHM = AgentRadixSortDownsweepPolicy::SCAN_ALGORITHM;
+
+ enum
+ {
+ BLOCK_THREADS = AgentRadixSortDownsweepPolicy::BLOCK_THREADS,
+ ITEMS_PER_THREAD = AgentRadixSortDownsweepPolicy::ITEMS_PER_THREAD,
+ RADIX_BITS = AgentRadixSortDownsweepPolicy::RADIX_BITS,
+ TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
+
+ RADIX_DIGITS = 1 << RADIX_BITS,
+ KEYS_ONLY = Equals<ValueT, NullType>::VALUE,
+ };
+
+ // Input iterator wrapper types (for applying cache modifier)
+ typedef CacheModifiedInputIterator<LOAD_MODIFIER, UnsignedBits, OffsetT> KeysItr;
+ typedef CacheModifiedInputIterator<LOAD_MODIFIER, ValueT, OffsetT> ValuesItr;
+
+ // Radix ranking type to use
+ typedef typename If<(RANK_ALGORITHM == RADIX_RANK_BASIC),
+ BlockRadixRank<BLOCK_THREADS, RADIX_BITS, IS_DESCENDING, false, SCAN_ALGORITHM>,
+ typename If<(RANK_ALGORITHM == RADIX_RANK_MEMOIZE),
+ BlockRadixRank<BLOCK_THREADS, RADIX_BITS, IS_DESCENDING, true, SCAN_ALGORITHM>,
+ BlockRadixRankMatch<BLOCK_THREADS, RADIX_BITS, IS_DESCENDING, SCAN_ALGORITHM>
+ >::Type
+ >::Type BlockRadixRankT;
+
+ enum
+ {
+ /// Number of bin-starting offsets tracked per thread
+ BINS_TRACKED_PER_THREAD = BlockRadixRankT::BINS_TRACKED_PER_THREAD
+ };
+
+ // BlockLoad type (keys)
+ typedef BlockLoad<
+ UnsignedBits,
+ BLOCK_THREADS,
+ ITEMS_PER_THREAD,
+ LOAD_ALGORITHM> BlockLoadKeysT;
+
+ // BlockLoad type (values)
+ typedef BlockLoad<
+ ValueT,
+ BLOCK_THREADS,
+ ITEMS_PER_THREAD,
+ LOAD_ALGORITHM> BlockLoadValuesT;
+
+ // Value exchange array type
+ typedef ValueT ValueExchangeT[TILE_ITEMS];
+
+ /**
+ * Shared memory storage layout
+ */
+ union __align__(16) _TempStorage
+ {
+ typename BlockLoadKeysT::TempStorage load_keys;
+ typename BlockLoadValuesT::TempStorage load_values;
+ typename BlockRadixRankT::TempStorage radix_rank;
+
+ struct
+ {
+ UnsignedBits exchange_keys[TILE_ITEMS];
+ OffsetT relative_bin_offsets[RADIX_DIGITS];
+ };
+
+ Uninitialized<ValueExchangeT> exchange_values;
+
+ OffsetT exclusive_digit_prefix[RADIX_DIGITS];
+ };
+
+
+ /// Alias wrapper allowing storage to be unioned
+ struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+ //---------------------------------------------------------------------
+ // Thread fields
+ //---------------------------------------------------------------------
+
+ // Shared storage for this CTA
+ _TempStorage &temp_storage;
+
+ // Input and output device pointers
+ KeysItr d_keys_in;
+ ValuesItr d_values_in;
+ UnsignedBits *d_keys_out;
+ ValueT *d_values_out;
+
+ // The global scatter base offset for each digit (valid in the first RADIX_DIGITS threads)
+ OffsetT bin_offset[BINS_TRACKED_PER_THREAD];
+
+ // The least-significant bit position of the current digit to extract
+ int current_bit;
+
+ // Number of bits in current digit
+ int num_bits;
+
+ // Whether to short-circuit
+ int short_circuit;
+
+ //---------------------------------------------------------------------
+ // Utility methods
+ //---------------------------------------------------------------------
+
+
+ /**
+ * Scatter ranked keys through shared memory, then to device-accessible memory
+ */
+ template <bool FULL_TILE>
+ __device__ __forceinline__ void ScatterKeys(
+ UnsignedBits (&twiddled_keys)[ITEMS_PER_THREAD],
+ OffsetT (&relative_bin_offsets)[ITEMS_PER_THREAD],
+ int (&ranks)[ITEMS_PER_THREAD],
+ OffsetT valid_items)
+ {
+ #pragma unroll
+ for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+ {
+ temp_storage.exchange_keys[ranks[ITEM]] = twiddled_keys[ITEM];
+ }
+
+ CTA_SYNC();
+
+ #pragma unroll
+ for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+ {
+ UnsignedBits key = temp_storage.exchange_keys[threadIdx.x + (ITEM * BLOCK_THREADS)];
+ UnsignedBits digit = BFE(key, current_bit, num_bits);
+ relative_bin_offsets[ITEM] = temp_storage.relative_bin_offsets[digit];
+
+ // Un-twiddle
+ key = Traits<KeyT>::TwiddleOut(key);
+
+ if (FULL_TILE ||
+ (static_cast<OffsetT>(threadIdx.x + (ITEM * BLOCK_THREADS)) < valid_items))
+ {
+ d_keys_out[relative_bin_offsets[ITEM] + threadIdx.x + (ITEM * BLOCK_THREADS)] = key;
+ }
+ }
+ }
+
+
+ /**
+ * Scatter ranked values through shared memory, then to device-accessible memory
+ */
+ template <bool FULL_TILE>
+ __device__ __forceinline__ void ScatterValues(
+ ValueT (&values)[ITEMS_PER_THREAD],
+ OffsetT (&relative_bin_offsets)[ITEMS_PER_THREAD],
+ int (&ranks)[ITEMS_PER_THREAD],
+ OffsetT valid_items)
+ {
+ CTA_SYNC();
+
+ ValueExchangeT &exchange_values = temp_storage.exchange_values.Alias();
+
+ #pragma unroll
+ for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+ {
+ exchange_values[ranks[ITEM]] = values[ITEM];
+ }
+
+ CTA_SYNC();
+
+ #pragma unroll
+ for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+ {
+ ValueT value = exchange_values[threadIdx.x + (ITEM * BLOCK_THREADS)];
+
+ if (FULL_TILE ||
+ (static_cast<OffsetT>(threadIdx.x + (ITEM * BLOCK_THREADS)) < valid_items))
+ {
+ d_values_out[relative_bin_offsets[ITEM] + threadIdx.x + (ITEM * BLOCK_THREADS)] = value;
+ }
+ }
+ }
+
+ /**
+ * Load a tile of keys (specialized for full tile, any ranking algorithm)
+ */
+ template <int _RANK_ALGORITHM>
+ __device__ __forceinline__ void LoadKeys(
+ UnsignedBits (&keys)[ITEMS_PER_THREAD],
+ OffsetT block_offset,
+ OffsetT valid_items,
+ UnsignedBits oob_item,
+ Int2Type<true> is_full_tile,
+ Int2Type<_RANK_ALGORITHM> rank_algorithm)
+ {
+ BlockLoadKeysT(temp_storage.load_keys).Load(
+ d_keys_in + block_offset, keys);
+
+ CTA_SYNC();
+ }
+
+
+ /**
+ * Load a tile of keys (specialized for partial tile, any ranking algorithm)
+ */
+ template <int _RANK_ALGORITHM>
+ __device__ __forceinline__ void LoadKeys(
+ UnsignedBits (&keys)[ITEMS_PER_THREAD],
+ OffsetT block_offset,
+ OffsetT valid_items,
+ UnsignedBits oob_item,
+ Int2Type<false> is_full_tile,
+ Int2Type<_RANK_ALGORITHM> rank_algorithm)
+ {
+ // Register pressure work-around: moving valid_items through shfl prevents compiler
+ // from reusing guards/addressing from prior guarded loads
+ valid_items = ShuffleIndex<CUB_PTX_WARP_THREADS>(valid_items, 0, 0xffffffff);
+
+ BlockLoadKeysT(temp_storage.load_keys).Load(
+ d_keys_in + block_offset, keys, valid_items, oob_item);
+
+ CTA_SYNC();
+ }
+
+
+ /**
+ * Load a tile of keys (specialized for full tile, match ranking algorithm)
+ */
+ __device__ __forceinline__ void LoadKeys(
+ UnsignedBits (&keys)[ITEMS_PER_THREAD],
+ OffsetT block_offset,
+ OffsetT valid_items,
+ UnsignedBits oob_item,
+ Int2Type<true> is_full_tile,
+ Int2Type<RADIX_RANK_MATCH> rank_algorithm)
+ {
+ LoadDirectWarpStriped(threadIdx.x, d_keys_in + block_offset, keys);
+ }
+
+
+ /**
+ * Load a tile of keys (specialized for partial tile, match ranking algorithm)
+ */
+ __device__ __forceinline__ void LoadKeys(
+ UnsignedBits (&keys)[ITEMS_PER_THREAD],
+ OffsetT block_offset,
+ OffsetT valid_items,
+ UnsignedBits oob_item,
+ Int2Type<false> is_full_tile,
+ Int2Type<RADIX_RANK_MATCH> rank_algorithm)
+ {
+ // Register pressure work-around: moving valid_items through shfl prevents compiler
+ // from reusing guards/addressing from prior guarded loads
+ valid_items = ShuffleIndex<CUB_PTX_WARP_THREADS>(valid_items, 0, 0xffffffff);
+
+ LoadDirectWarpStriped(threadIdx.x, d_keys_in + block_offset, keys, valid_items, oob_item);
+ }
+
+
+ /**
+ * Load a tile of values (specialized for full tile, any ranking algorithm)
+ */
+ template <int _RANK_ALGORITHM>
+ __device__ __forceinline__ void LoadValues(
+ ValueT (&values)[ITEMS_PER_THREAD],
+ OffsetT block_offset,
+ OffsetT valid_items,
+ Int2Type<true> is_full_tile,
+ Int2Type<_RANK_ALGORITHM> rank_algorithm)
+ {
+ BlockLoadValuesT(temp_storage.load_values).Load(
+ d_values_in + block_offset, values);
+
+ CTA_SYNC();
+ }
+
+
+ /**
+ * Load a tile of values (specialized for partial tile, any ranking algorithm)
+ */
+ template <int _RANK_ALGORITHM>
+ __device__ __forceinline__ void LoadValues(
+ ValueT (&values)[ITEMS_PER_THREAD],
+ OffsetT block_offset,
+ OffsetT valid_items,
+ Int2Type<false> is_full_tile,
+ Int2Type<_RANK_ALGORITHM> rank_algorithm)
+ {
+ // Register pressure work-around: moving valid_items through shfl prevents compiler
+ // from reusing guards/addressing from prior guarded loads
+ valid_items = ShuffleIndex<CUB_PTX_WARP_THREADS>(valid_items, 0, 0xffffffff);
+
+ BlockLoadValuesT(temp_storage.load_values).Load(
+ d_values_in + block_offset, values, valid_items);
+
+ CTA_SYNC();
+ }
+
+
+ /**
+ * Load a tile of items (specialized for full tile, match ranking algorithm)
+ */
+ __device__ __forceinline__ void LoadValues(
+ ValueT (&values)[ITEMS_PER_THREAD],
+ OffsetT block_offset,
+ OffsetT valid_items,
+ Int2Type<true> is_full_tile,
+ Int2Type<RADIX_RANK_MATCH> rank_algorithm)
+ {
+ LoadDirectWarpStriped(threadIdx.x, d_values_in + block_offset, values);
+ }
+
+
+ /**
+ * Load a tile of items (specialized for partial tile, match ranking algorithm)
+ */
+ __device__ __forceinline__ void LoadValues(
+ ValueT (&values)[ITEMS_PER_THREAD],
+ OffsetT block_offset,
+ OffsetT valid_items,
+ Int2Type<false> is_full_tile,
+ Int2Type<RADIX_RANK_MATCH> rank_algorithm)
+ {
+ // Register pressure work-around: moving valid_items through shfl prevents compiler
+ // from reusing guards/addressing from prior guarded loads
+ valid_items = ShuffleIndex<CUB_PTX_WARP_THREADS>(valid_items, 0, 0xffffffff);
+
+ LoadDirectWarpStriped(threadIdx.x, d_values_in + block_offset, values, valid_items);
+ }
+
+
+ /**
+ * Truck along associated values
+ */
+ template <bool FULL_TILE>
+ __device__ __forceinline__ void GatherScatterValues(
+ OffsetT (&relative_bin_offsets)[ITEMS_PER_THREAD],
+ int (&ranks)[ITEMS_PER_THREAD],
+ OffsetT block_offset,
+ OffsetT valid_items,
+ Int2Type<false> /*is_keys_only*/)
+ {
+ ValueT values[ITEMS_PER_THREAD];
+
+ CTA_SYNC();
+
+ LoadValues(
+ values,
+ block_offset,
+ valid_items,
+ Int2Type<FULL_TILE>(),
+ Int2Type<RANK_ALGORITHM>());
+
+ ScatterValues<FULL_TILE>(
+ values,
+ relative_bin_offsets,
+ ranks,
+ valid_items);
+ }
+
+
+ /**
+ * Truck along associated values (specialized for key-only sorting)
+ */
+ template <bool FULL_TILE>
+ __device__ __forceinline__ void GatherScatterValues(
+ OffsetT (&/*relative_bin_offsets*/)[ITEMS_PER_THREAD],
+ int (&/*ranks*/)[ITEMS_PER_THREAD],
+ OffsetT /*block_offset*/,
+ OffsetT /*valid_items*/,
+ Int2Type<true> /*is_keys_only*/)
+ {}
+
+
+ /**
+ * Process tile
+ */
+ template <bool FULL_TILE>
+ __device__ __forceinline__ void ProcessTile(
+ OffsetT block_offset,
+ const OffsetT &valid_items = TILE_ITEMS)
+ {
+ UnsignedBits keys[ITEMS_PER_THREAD];
+ int ranks[ITEMS_PER_THREAD];
+ OffsetT relative_bin_offsets[ITEMS_PER_THREAD];
+
+ // Assign default (min/max) value to all keys
+ UnsignedBits default_key = (IS_DESCENDING) ? LOWEST_KEY : MAX_KEY;
+
+ // Load tile of keys
+ LoadKeys(
+ keys,
+ block_offset,
+ valid_items,
+ default_key,
+ Int2Type<FULL_TILE>(),
+ Int2Type<RANK_ALGORITHM>());
+
+ // Twiddle key bits if necessary
+ #pragma unroll
+ for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
+ {
+ keys[KEY] = Traits<KeyT>::TwiddleIn(keys[KEY]);
+ }
+
+ // Rank the twiddled keys
+ int exclusive_digit_prefix[BINS_TRACKED_PER_THREAD];
+ BlockRadixRankT(temp_storage.radix_rank).RankKeys(
+ keys,
+ ranks,
+ current_bit,
+ num_bits,
+ exclusive_digit_prefix);
+
+ CTA_SYNC();
+
+ // Share exclusive digit prefix
+ #pragma unroll
+ for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
+ {
+ int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
+ if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
+ {
+ // Store exclusive prefix
+ temp_storage.exclusive_digit_prefix[bin_idx] =
+ exclusive_digit_prefix[track];
+ }
+ }
+
+ CTA_SYNC();
+
+ // Get inclusive digit prefix
+ int inclusive_digit_prefix[BINS_TRACKED_PER_THREAD];
+
+ #pragma unroll
+ for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
+ {
+ int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
+ if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
+ {
+ if (IS_DESCENDING)
+ {
+ // Get inclusive digit prefix from exclusive prefix (higher bins come first)
+ inclusive_digit_prefix[track] = (bin_idx == 0) ?
+ (BLOCK_THREADS * ITEMS_PER_THREAD) :
+ temp_storage.exclusive_digit_prefix[bin_idx - 1];
+ }
+ else
+ {
+ // Get inclusive digit prefix from exclusive prefix (lower bins come first)
+ inclusive_digit_prefix[track] = (bin_idx == RADIX_DIGITS - 1) ?
+ (BLOCK_THREADS * ITEMS_PER_THREAD) :
+ temp_storage.exclusive_digit_prefix[bin_idx + 1];
+ }
+ }
+ }
+
+ CTA_SYNC();
+
+ // Update global scatter base offsets for each digit
+ #pragma unroll
+ for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
+ {
+ int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
+ if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
+ {
+ bin_offset[track] -= exclusive_digit_prefix[track];
+ temp_storage.relative_bin_offsets[bin_idx] = bin_offset[track];
+ bin_offset[track] += inclusive_digit_prefix[track];
+ }
+ }
+
+ CTA_SYNC();
+
+ // Scatter keys
+ ScatterKeys<FULL_TILE>(keys, relative_bin_offsets, ranks, valid_items);
+
+ // Gather/scatter values
+ GatherScatterValues<FULL_TILE>(relative_bin_offsets, ranks, block_offset, valid_items, Int2Type<KEYS_ONLY>());
+ }
+
+ //---------------------------------------------------------------------
+ // Copy shortcut
+ //---------------------------------------------------------------------
+
+ /**
+ * Copy tiles within the range of input
+ */
+ template <
+ typename InputIteratorT,
+ typename T>
+ __device__ __forceinline__ void Copy(
+ InputIteratorT d_in,
+ T *d_out,
+ OffsetT block_offset,
+ OffsetT block_end)
+ {
+ // Simply copy the input
+ while (block_offset + TILE_ITEMS <= block_end)
+ {
+ T items[ITEMS_PER_THREAD];
+
+ LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_in + block_offset, items);
+ CTA_SYNC();
+ StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_out + block_offset, items);
+
+ block_offset += TILE_ITEMS;
+ }
+
+ // Clean up last partial tile with guarded-I/O
+ if (block_offset < block_end)
+ {
+ OffsetT valid_items = block_end - block_offset;
+
+ T items[ITEMS_PER_THREAD];
+
+ LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_in + block_offset, items, valid_items);
+ CTA_SYNC();
+ StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_out + block_offset, items, valid_items);
+ }
+ }
+
+
+ /**
+ * Copy tiles within the range of input (specialized for NullType)
+ */
+ template <typename InputIteratorT>
+ __device__ __forceinline__ void Copy(
+ InputIteratorT /*d_in*/,
+ NullType * /*d_out*/,
+ OffsetT /*block_offset*/,
+ OffsetT /*block_end*/)
+ {}
+
+
+ //---------------------------------------------------------------------
+ // Interface
+ //---------------------------------------------------------------------
+
+ /**
+ * Constructor
+ */
+ __device__ __forceinline__ AgentRadixSortDownsweep(
+ TempStorage &temp_storage,
+ OffsetT (&bin_offset)[BINS_TRACKED_PER_THREAD],
+ OffsetT num_items,
+ const KeyT *d_keys_in,
+ KeyT *d_keys_out,
+ const ValueT *d_values_in,
+ ValueT *d_values_out,
+ int current_bit,
+ int num_bits)
+ :
+ temp_storage(temp_storage.Alias()),
+ d_keys_in(reinterpret_cast<const UnsignedBits*>(d_keys_in)),
+ d_values_in(d_values_in),
+ d_keys_out(reinterpret_cast<UnsignedBits*>(d_keys_out)),
+ d_values_out(d_values_out),
+ current_bit(current_bit),
+ num_bits(num_bits),
+ short_circuit(1)
+ {
+ #pragma unroll
+ for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
+ {
+ this->bin_offset[track] = bin_offset[track];
+
+ int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
+ if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
+ {
+ // Short circuit if the histogram has only bin counts of zero or problem-size
+ short_circuit = short_circuit && ((bin_offset[track] == 0) || (bin_offset[track] == num_items));
+ }
+ }
+
+ short_circuit = CTA_SYNC_AND(short_circuit);
+ }
+
+
+ /**
+ * Constructor
+ */
+ __device__ __forceinline__ AgentRadixSortDownsweep(
+ TempStorage &temp_storage,
+ OffsetT num_items,
+ OffsetT *d_spine,
+ const KeyT *d_keys_in,
+ KeyT *d_keys_out,
+ const ValueT *d_values_in,
+ ValueT *d_values_out,
+ int current_bit,
+ int num_bits)
+ :
+ temp_storage(temp_storage.Alias()),
+ d_keys_in(reinterpret_cast<const UnsignedBits*>(d_keys_in)),
+ d_values_in(d_values_in),
+ d_keys_out(reinterpret_cast<UnsignedBits*>(d_keys_out)),
+ d_values_out(d_values_out),
+ current_bit(current_bit),
+ num_bits(num_bits),
+ short_circuit(1)
+ {
+ #pragma unroll
+ for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
+ {
+ int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
+
+ // Load digit bin offsets (each of the first RADIX_DIGITS threads will load an offset for that digit)
+ if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
+ {
+ if (IS_DESCENDING)
+ bin_idx = RADIX_DIGITS - bin_idx - 1;
+
+ // Short circuit if the first block's histogram has only bin counts of zero or problem-size
+ OffsetT first_block_bin_offset = d_spine[gridDim.x * bin_idx];
+ short_circuit = short_circuit && ((first_block_bin_offset == 0) || (first_block_bin_offset == num_items));
+
+ // Load my block's bin offset for my bin
+ bin_offset[track] = d_spine[(gridDim.x * bin_idx) + blockIdx.x];
+ }
+ }
+
+ short_circuit = CTA_SYNC_AND(short_circuit);
+ }
+
+
+ /**
+ * Distribute keys from a segment of input tiles.
+ */
+ __device__ __forceinline__ void ProcessRegion(
+ OffsetT block_offset,
+ OffsetT block_end)
+ {
+ if (short_circuit)
+ {
+ // Copy keys
+ Copy(d_keys_in, d_keys_out, block_offset, block_end);
+
+ // Copy values
+ Copy(d_values_in, d_values_out, block_offset, block_end);
+ }
+ else
+ {
+ // Process full tiles of tile_items
+ #pragma unroll 1
+ while (block_offset + TILE_ITEMS <= block_end)
+ {
+ ProcessTile(block_offset);
+ block_offset += TILE_ITEMS;
+
+ CTA_SYNC();
+ }
+
+ // Clean up last partial tile with guarded-I/O
+ if (block_offset < block_end)
+ {
+ ProcessTile(block_offset, block_end - block_offset);
+ }
+
+ }
+ }
+
+};
+
+
+
+} // CUB namespace
+CUB_NS_POSTFIX // Optional outer namespace(s)
+
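Reviewer note: the downsweep agent above is the per-pass scatter stage that cub::DeviceRadixSort drives from the host. As a hedged orientation aid (not part of this vendored file), here is a minimal host-side sketch of the public entry point that ultimately instantiates these upsweep/downsweep agents; buffer names and ownership are illustrative assumptions.

// Illustrative usage sketch only.
#include <cub/cub.cuh>
#include <cuda_runtime.h>

void sort_keys(unsigned int *d_keys_in, unsigned int *d_keys_out, int num_items)
{
    void *d_temp_storage = nullptr;
    size_t temp_storage_bytes = 0;

    // First call only computes the required temporary storage size.
    cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes,
                                   d_keys_in, d_keys_out, num_items);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);

    // Second call runs the radix passes implemented by the agents in this file.
    cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes,
                                   d_keys_in, d_keys_out, num_items);
    cudaFree(d_temp_storage);
}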
diff --git a/3rdparty/cub-1.8.0/cub/agent/agent_radix_sort_upsweep.cuh b/3rdparty/cub-1.8.0/cub/agent/agent_radix_sort_upsweep.cuh
new file mode 100644
index 00000000..2081cefb
--- /dev/null
+++ b/3rdparty/cub-1.8.0/cub/agent/agent_radix_sort_upsweep.cuh
@@ -0,0 +1,526 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * AgentRadixSortUpsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep.
+ */
+
+#pragma once
+
+#include "../thread/thread_reduce.cuh"
+#include "../thread/thread_load.cuh"
+#include "../warp/warp_reduce.cuh"
+#include "../block/block_load.cuh"
+#include "../util_type.cuh"
+#include "../iterator/cache_modified_input_iterator.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/******************************************************************************
+ * Tuning policy types
+ ******************************************************************************/
+
+/**
+ * Parameterizable tuning policy type for AgentRadixSortUpsweep
+ */
+template <
+ int _BLOCK_THREADS, ///< Threads per thread block
+ int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input)
+ CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading keys
+ int _RADIX_BITS> ///< The number of radix bits, i.e., log2(bins)
+struct AgentRadixSortUpsweepPolicy
+{
+ enum
+ {
+ BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block
+ ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input)
+ RADIX_BITS = _RADIX_BITS, ///< The number of radix bits, i.e., log2(bins)
+ };
+
+ static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading keys
+};
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+/**
+ * \brief AgentRadixSortUpsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep.
+ */
+template <
+ typename AgentRadixSortUpsweepPolicy, ///< Parameterized AgentRadixSortUpsweepPolicy tuning policy type
+ typename KeyT, ///< KeyT type
+ typename OffsetT> ///< Signed integer type for global offsets
+struct AgentRadixSortUpsweep
+{
+
+ //---------------------------------------------------------------------
+ // Type definitions and constants
+ //---------------------------------------------------------------------
+
+ typedef typename Traits<KeyT>::UnsignedBits UnsignedBits;
+
+ // Integer type for digit counters (to be packed into words of PackedCounters)
+ typedef unsigned char DigitCounter;
+
+ // Integer type for packing DigitCounters into columns of shared memory banks
+ typedef unsigned int PackedCounter;
+
+ static const CacheLoadModifier LOAD_MODIFIER = AgentRadixSortUpsweepPolicy::LOAD_MODIFIER;
+
+ enum
+ {
+ RADIX_BITS = AgentRadixSortUpsweepPolicy::RADIX_BITS,
+ BLOCK_THREADS = AgentRadixSortUpsweepPolicy::BLOCK_THREADS,
+ KEYS_PER_THREAD = AgentRadixSortUpsweepPolicy::ITEMS_PER_THREAD,
+
+ RADIX_DIGITS = 1 << RADIX_BITS,
+
+ LOG_WARP_THREADS = CUB_PTX_LOG_WARP_THREADS,
+ WARP_THREADS = 1 << LOG_WARP_THREADS,
+ WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
+
+ TILE_ITEMS = BLOCK_THREADS * KEYS_PER_THREAD,
+
+ BYTES_PER_COUNTER = sizeof(DigitCounter),
+ LOG_BYTES_PER_COUNTER = Log2<BYTES_PER_COUNTER>::VALUE,
+
+ PACKING_RATIO = sizeof(PackedCounter) / sizeof(DigitCounter),
+ LOG_PACKING_RATIO = Log2<PACKING_RATIO>::VALUE,
+
+ LOG_COUNTER_LANES = CUB_MAX(0, RADIX_BITS - LOG_PACKING_RATIO),
+ COUNTER_LANES = 1 << LOG_COUNTER_LANES,
+
+ // To prevent counter overflow, we must periodically unpack and aggregate the
+ // digit counters back into registers. Each counter lane is assigned to a
+ // warp for aggregation.
+
+ LANES_PER_WARP = CUB_MAX(1, (COUNTER_LANES + WARPS - 1) / WARPS),
+
+ // Unroll tiles in batches without risk of counter overflow
+ UNROLL_COUNT = CUB_MIN(64, 255 / KEYS_PER_THREAD),
+ UNROLLED_ELEMENTS = UNROLL_COUNT * TILE_ITEMS,
+ };
+
+
+ // Input iterator wrapper type (for applying cache modifiers)
+ typedef CacheModifiedInputIterator<LOAD_MODIFIER, UnsignedBits, OffsetT> KeysItr;
+
+ /**
+ * Shared memory storage layout
+ */
+ union __align__(16) _TempStorage
+ {
+ DigitCounter thread_counters[COUNTER_LANES][BLOCK_THREADS][PACKING_RATIO];
+ PackedCounter packed_thread_counters[COUNTER_LANES][BLOCK_THREADS];
+ OffsetT block_counters[WARP_THREADS][RADIX_DIGITS];
+ };
+
+
+ /// Alias wrapper allowing storage to be unioned
+ struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+ //---------------------------------------------------------------------
+ // Thread fields (aggregate state bundle)
+ //---------------------------------------------------------------------
+
+ // Shared storage for this CTA
+ _TempStorage &temp_storage;
+
+ // Thread-local counters for periodically aggregating composite-counter lanes
+ OffsetT local_counts[LANES_PER_WARP][PACKING_RATIO];
+
+ // Input and output device pointers
+ KeysItr d_keys_in;
+
+ // The least-significant bit position of the current digit to extract
+ int current_bit;
+
+ // Number of bits in current digit
+ int num_bits;
+
+
+
+ //---------------------------------------------------------------------
+ // Helper structure for templated iteration
+ //---------------------------------------------------------------------
+
+ // Iterate
+ template <int COUNT, int MAX>
+ struct Iterate
+ {
+ // BucketKeys
+ static __device__ __forceinline__ void BucketKeys(
+ AgentRadixSortUpsweep &cta,
+ UnsignedBits keys[KEYS_PER_THREAD])
+ {
+ cta.Bucket(keys[COUNT]);
+
+ // Next
+ Iterate<COUNT + 1, MAX>::BucketKeys(cta, keys);
+ }
+ };
+
+ // Terminate
+ template <int MAX>
+ struct Iterate<MAX, MAX>
+ {
+ // BucketKeys
+ static __device__ __forceinline__ void BucketKeys(AgentRadixSortUpsweep &/*cta*/, UnsignedBits /*keys*/[KEYS_PER_THREAD]) {}
+ };
+
+
+ //---------------------------------------------------------------------
+ // Utility methods
+ //---------------------------------------------------------------------
+
+ /**
+ * Decode a key and increment corresponding smem digit counter
+ */
+ __device__ __forceinline__ void Bucket(UnsignedBits key)
+ {
+ // Perform transform op
+ UnsignedBits converted_key = Traits<KeyT>::TwiddleIn(key);
+
+ // Extract current digit bits
+ UnsignedBits digit = BFE(converted_key, current_bit, num_bits);
+
+ // Get sub-counter offset
+ UnsignedBits sub_counter = digit & (PACKING_RATIO - 1);
+
+ // Get row offset
+ UnsignedBits row_offset = digit >> LOG_PACKING_RATIO;
+
+ // Increment counter
+ temp_storage.thread_counters[row_offset][threadIdx.x][sub_counter]++;
+ }
+
+
+ /**
+ * Reset composite counters
+ */
+ __device__ __forceinline__ void ResetDigitCounters()
+ {
+ #pragma unroll
+ for (int LANE = 0; LANE < COUNTER_LANES; LANE++)
+ {
+ temp_storage.packed_thread_counters[LANE][threadIdx.x] = 0;
+ }
+ }
+
+
+ /**
+ * Reset the unpacked counters in each thread
+ */
+ __device__ __forceinline__ void ResetUnpackedCounters()
+ {
+ #pragma unroll
+ for (int LANE = 0; LANE < LANES_PER_WARP; LANE++)
+ {
+ #pragma unroll
+ for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++)
+ {
+ local_counts[LANE][UNPACKED_COUNTER] = 0;
+ }
+ }
+ }
+
+
+ /**
+ * Extracts and aggregates the digit counters for each counter lane
+ * owned by this warp
+ */
+ __device__ __forceinline__ void UnpackDigitCounts()
+ {
+ unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS;
+ unsigned int warp_tid = LaneId();
+
+ #pragma unroll
+ for (int LANE = 0; LANE < LANES_PER_WARP; LANE++)
+ {
+ const int counter_lane = (LANE * WARPS) + warp_id;
+ if (counter_lane < COUNTER_LANES)
+ {
+ #pragma unroll
+ for (int PACKED_COUNTER = 0; PACKED_COUNTER < BLOCK_THREADS; PACKED_COUNTER += WARP_THREADS)
+ {
+ #pragma unroll
+ for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++)
+ {
+ OffsetT counter = temp_storage.thread_counters[counter_lane][warp_tid + PACKED_COUNTER][UNPACKED_COUNTER];
+ local_counts[LANE][UNPACKED_COUNTER] += counter;
+ }
+ }
+ }
+ }
+ }
+
+
+ /**
+ * Processes a single, full tile
+ */
+ __device__ __forceinline__ void ProcessFullTile(OffsetT block_offset)
+ {
+ // Tile of keys
+ UnsignedBits keys[KEYS_PER_THREAD];
+
+ LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_keys_in + block_offset, keys);
+
+ // Prevent hoisting
+ CTA_SYNC();
+
+ // Bucket tile of keys
+ Iterate<0, KEYS_PER_THREAD>::BucketKeys(*this, keys);
+ }
+
+
+ /**
+ * Processes a single load (may have some threads masked off)
+ */
+ __device__ __forceinline__ void ProcessPartialTile(
+ OffsetT block_offset,
+ const OffsetT &block_end)
+ {
+ // Process partial tile if necessary using single loads
+ block_offset += threadIdx.x;
+ while (block_offset < block_end)
+ {
+ // Load and bucket key
+ UnsignedBits key = d_keys_in[block_offset];
+ Bucket(key);
+ block_offset += BLOCK_THREADS;
+ }
+ }
+
+
+ //---------------------------------------------------------------------
+ // Interface
+ //---------------------------------------------------------------------
+
+ /**
+ * Constructor
+ */
+ __device__ __forceinline__ AgentRadixSortUpsweep(
+ TempStorage &temp_storage,
+ const KeyT *d_keys_in,
+ int current_bit,
+ int num_bits)
+ :
+ temp_storage(temp_storage.Alias()),
+ d_keys_in(reinterpret_cast<const UnsignedBits*>(d_keys_in)),
+ current_bit(current_bit),
+ num_bits(num_bits)
+ {}
+
+
+ /**
+ * Compute radix digit histograms from a segment of input tiles.
+ */
+ __device__ __forceinline__ void ProcessRegion(
+ OffsetT block_offset,
+ const OffsetT &block_end)
+ {
+ // Reset digit counters in smem and unpacked counters in registers
+ ResetDigitCounters();
+ ResetUnpackedCounters();
+
+ // Unroll batches of full tiles
+ while (block_offset + UNROLLED_ELEMENTS <= block_end)
+ {
+ for (int i = 0; i < UNROLL_COUNT; ++i)
+ {
+ ProcessFullTile(block_offset);
+ block_offset += TILE_ITEMS;
+ }
+
+ CTA_SYNC();
+
+ // Aggregate back into local_count registers to prevent overflow
+ UnpackDigitCounts();
+
+ CTA_SYNC();
+
+ // Reset composite counters in lanes
+ ResetDigitCounters();
+ }
+
+ // Unroll single full tiles
+ while (block_offset + TILE_ITEMS <= block_end)
+ {
+ ProcessFullTile(block_offset);
+ block_offset += TILE_ITEMS;
+ }
+
+ // Process partial tile if necessary
+ ProcessPartialTile(
+ block_offset,
+ block_end);
+
+ CTA_SYNC();
+
+ // Aggregate back into local_count registers
+ UnpackDigitCounts();
+ }
+
+
+ /**
+ * Extract counts (saving them to the external array)
+ */
+ template <bool IS_DESCENDING>
+ __device__ __forceinline__ void ExtractCounts(
+ OffsetT *counters,
+ int bin_stride = 1,
+ int bin_offset = 0)
+ {
+ unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS;
+ unsigned int warp_tid = LaneId();
+
+ // Place unpacked digit counters in shared memory
+ #pragma unroll
+ for (int LANE = 0; LANE < LANES_PER_WARP; LANE++)
+ {
+ int counter_lane = (LANE * WARPS) + warp_id;
+ if (counter_lane < COUNTER_LANES)
+ {
+ int digit_row = counter_lane << LOG_PACKING_RATIO;
+
+ #pragma unroll
+ for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++)
+ {
+ int bin_idx = digit_row + UNPACKED_COUNTER;
+
+ temp_storage.block_counters[warp_tid][bin_idx] =
+ local_counts[LANE][UNPACKED_COUNTER];
+ }
+ }
+ }
+
+ CTA_SYNC();
+
+ // Rake-reduce bin_count reductions
+
+ // Whole blocks
+ #pragma unroll
+ for (int BIN_BASE = RADIX_DIGITS % BLOCK_THREADS;
+ (BIN_BASE + BLOCK_THREADS) <= RADIX_DIGITS;
+ BIN_BASE += BLOCK_THREADS)
+ {
+ int bin_idx = BIN_BASE + threadIdx.x;
+
+ OffsetT bin_count = 0;
+ #pragma unroll
+ for (int i = 0; i < WARP_THREADS; ++i)
+ bin_count += temp_storage.block_counters[i][bin_idx];
+
+ if (IS_DESCENDING)
+ bin_idx = RADIX_DIGITS - bin_idx - 1;
+
+ counters[(bin_stride * bin_idx) + bin_offset] = bin_count;
+ }
+
+ // Remainder
+ if ((RADIX_DIGITS % BLOCK_THREADS != 0) && (threadIdx.x < RADIX_DIGITS))
+ {
+ int bin_idx = threadIdx.x;
+
+ OffsetT bin_count = 0;
+ #pragma unroll
+ for (int i = 0; i < WARP_THREADS; ++i)
+ bin_count += temp_storage.block_counters[i][bin_idx];
+
+ if (IS_DESCENDING)
+ bin_idx = RADIX_DIGITS - bin_idx - 1;
+
+ counters[(bin_stride * bin_idx) + bin_offset] = bin_count;
+ }
+ }
+
+
+ /**
+ * Extract counts
+ */
+ template <int BINS_TRACKED_PER_THREAD>
+ __device__ __forceinline__ void ExtractCounts(
+ OffsetT (&bin_count)[BINS_TRACKED_PER_THREAD]) ///< [out] The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1]
+ {
+ unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS;
+ unsigned int warp_tid = LaneId();
+
+ // Place unpacked digit counters in shared memory
+ #pragma unroll
+ for (int LANE = 0; LANE < LANES_PER_WARP; LANE++)
+ {
+ int counter_lane = (LANE * WARPS) + warp_id;
+ if (counter_lane < COUNTER_LANES)
+ {
+ int digit_row = counter_lane << LOG_PACKING_RATIO;
+
+ #pragma unroll
+ for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++)
+ {
+ int bin_idx = digit_row + UNPACKED_COUNTER;
+
+ temp_storage.block_counters[warp_tid][bin_idx] =
+ local_counts[LANE][UNPACKED_COUNTER];
+ }
+ }
+ }
+
+ CTA_SYNC();
+
+ // Rake-reduce bin_count reductions
+ #pragma unroll
+ for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
+ {
+ int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
+
+ if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
+ {
+ bin_count[track] = 0;
+
+ #pragma unroll
+ for (int i = 0; i < WARP_THREADS; ++i)
+ bin_count[track] += temp_storage.block_counters[i][bin_idx];
+ }
+ }
+ }
+
+};
+
+
+} // CUB namespace
+CUB_NS_POSTFIX // Optional outer namespace(s)
+
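Reviewer note: the constants block in the upsweep agent above is built around one idea: four 8-bit DigitCounters are packed into a single 32-bit PackedCounter so that a digit increment touches one shared-memory word, and tiles are processed in bounded batches so no byte exceeds 255 before UnpackDigitCounts() drains it into per-warp registers. A hedged, host-side sketch of that packing arithmetic (standalone C++; names and the fixed PACKING_RATIO of 4 are illustrative, not taken from this file):

#include <cstdint>
#include <cstdio>

int main()
{
    // One 32-bit word holding four 8-bit digit counters (PACKING_RATIO == 4).
    uint32_t packed = 0;

    auto bump = [&](unsigned digit) {
        unsigned sub_counter = digit & 3u;      // digit % PACKING_RATIO selects the byte lane
        packed += 1u << (8u * sub_counter);     // a single add increments that lane
    };

    // Stay under 255 increments per lane, mirroring the UNROLL_COUNT batching above.
    for (unsigned i = 0; i < 200; ++i)
        bump(i & 3u);

    for (unsigned lane = 0; lane < 4; ++lane)
        printf("digit %u count = %u\n", lane, (packed >> (8u * lane)) & 0xFFu);
    return 0;
}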
diff --git a/3rdparty/cub-1.8.0/cub/agent/agent_reduce.cuh b/3rdparty/cub-1.8.0/cub/agent/agent_reduce.cuh
new file mode 100644
index 00000000..000a905c
--- /dev/null
+++ b/3rdparty/cub-1.8.0/cub/agent/agent_reduce.cuh
@@ -0,0 +1,385 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::AgentReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction.
+ */
+
+#pragma once
+
+ #include <iterator>
+
+#include "../block/block_load.cuh"
+#include "../block/block_reduce.cuh"
+#include "../grid/grid_mapping.cuh"
+#include "../grid/grid_even_share.cuh"
+#include "../util_type.cuh"
+#include "../iterator/cache_modified_input_iterator.cuh"
+#include "../util_namespace.cuh"
+
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Tuning policy types
+ ******************************************************************************/
+
+/**
+ * Parameterizable tuning policy type for AgentReduce
+ */
+template <
+ int _BLOCK_THREADS, ///< Threads per thread block
+ int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input)
+ int _VECTOR_LOAD_LENGTH, ///< Number of items per vectorized load
+ BlockReduceAlgorithm _BLOCK_ALGORITHM, ///< Cooperative block-wide reduction algorithm to use
+ CacheLoadModifier _LOAD_MODIFIER> ///< Cache load modifier for reading input elements
+struct AgentReducePolicy
+{
+ enum
+ {
+ BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block
+ ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input)
+ VECTOR_LOAD_LENGTH = _VECTOR_LOAD_LENGTH, ///< Number of items per vectorized load
+ };
+
+ static const BlockReduceAlgorithm BLOCK_ALGORITHM = _BLOCK_ALGORITHM; ///< Cooperative block-wide reduction algorithm to use
+ static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements
+};
+
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+/**
+ * \brief AgentReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction.
+ *
+ * Each thread reduces only the values it loads. If \p FIRST_TILE, this
+ * partial reduction is stored into \p thread_aggregate. Otherwise it is
+ * accumulated into \p thread_aggregate.
+ */
+template <
+ typename AgentReducePolicy, ///< Parameterized AgentReducePolicy tuning policy type
+ typename InputIteratorT, ///< Random-access iterator type for input
+ typename OutputIteratorT, ///< Random-access iterator type for output
+ typename OffsetT, ///< Signed integer type for global offsets
+ typename ReductionOp> ///< Binary reduction operator type having member T operator()(const T &a, const T &b)
+struct AgentReduce
+{
+
+ //---------------------------------------------------------------------
+ // Types and constants
+ //---------------------------------------------------------------------
+
+ /// The input value type
+ typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
+
+ /// The output value type
+ typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ?
+ typename std::iterator_traits<InputIteratorT>::value_type, // ... then the input iterator's value type,
+ typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT; // ... else the output iterator's value type
+
+ /// Vector type of InputT for data movement
+ typedef typename CubVector<InputT, AgentReducePolicy::VECTOR_LOAD_LENGTH>::Type VectorT;
+
+ /// Input iterator wrapper type (for applying cache modifier)
+ typedef typename If<IsPointer<InputIteratorT>::VALUE,
+ CacheModifiedInputIterator<AgentReducePolicy::LOAD_MODIFIER, InputT, OffsetT>, // Wrap the native input pointer with CacheModifiedInputIterator
+ InputIteratorT>::Type // Directly use the supplied input iterator type
+ WrappedInputIteratorT;
+
+ /// Constants
+ enum
+ {
+ BLOCK_THREADS = AgentReducePolicy::BLOCK_THREADS,
+ ITEMS_PER_THREAD = AgentReducePolicy::ITEMS_PER_THREAD,
+ VECTOR_LOAD_LENGTH = CUB_MIN(ITEMS_PER_THREAD, AgentReducePolicy::VECTOR_LOAD_LENGTH),
+ TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
+
+ // Can vectorize according to the policy if the input iterator is a native pointer to a primitive type
+ ATTEMPT_VECTORIZATION = (VECTOR_LOAD_LENGTH > 1) &&
+ (ITEMS_PER_THREAD % VECTOR_LOAD_LENGTH == 0) &&
+ (IsPointer<InputIteratorT>::VALUE) && Traits<InputT>::PRIMITIVE,
+
+ };
+
+ static const CacheLoadModifier LOAD_MODIFIER = AgentReducePolicy::LOAD_MODIFIER;
+ static const BlockReduceAlgorithm BLOCK_ALGORITHM = AgentReducePolicy::BLOCK_ALGORITHM;
+
+ /// Parameterized BlockReduce primitive
+ typedef BlockReduce<OutputT, BLOCK_THREADS, AgentReducePolicy::BLOCK_ALGORITHM> BlockReduceT;
+
+ /// Shared memory type required by this thread block
+ struct _TempStorage
+ {
+ typename BlockReduceT::TempStorage reduce;
+ };
+
+ /// Alias wrapper allowing storage to be unioned
+ struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+ //---------------------------------------------------------------------
+ // Per-thread fields
+ //---------------------------------------------------------------------
+
+ _TempStorage& temp_storage; ///< Reference to temp_storage
+ InputIteratorT d_in; ///< Input data to reduce
+ WrappedInputIteratorT d_wrapped_in; ///< Wrapped input data to reduce
+ ReductionOp reduction_op; ///< Binary reduction operator
+
+
+ //---------------------------------------------------------------------
+ // Utility
+ //---------------------------------------------------------------------
+
+
+ // Whether or not the input is aligned with the vector type (specialized for types we can vectorize)
+ template <typename Iterator>
+ static __device__ __forceinline__ bool IsAligned(
+ Iterator d_in,
+ Int2Type<true> /*can_vectorize*/)
+ {
+ return (size_t(d_in) & (sizeof(VectorT) - 1)) == 0;
+ }
+
+ // Whether or not the input is aligned with the vector type (specialized for types we cannot vectorize)
+ template <typename Iterator>
+ static __device__ __forceinline__ bool IsAligned(
+ Iterator /*d_in*/,
+ Int2Type<false> /*can_vectorize*/)
+ {
+ return false;
+ }
+
+
+ //---------------------------------------------------------------------
+ // Constructor
+ //---------------------------------------------------------------------
+
+ /**
+ * Constructor
+ */
+ __device__ __forceinline__ AgentReduce(
+ TempStorage& temp_storage, ///< Reference to temp_storage
+ InputIteratorT d_in, ///< Input data to reduce
+ ReductionOp reduction_op) ///< Binary reduction operator
+ :
+ temp_storage(temp_storage.Alias()),
+ d_in(d_in),
+ d_wrapped_in(d_in),
+ reduction_op(reduction_op)
+ {}
+
+
+ //---------------------------------------------------------------------
+ // Tile consumption
+ //---------------------------------------------------------------------
+
+ /**
+ * Consume a full tile of input (non-vectorized)
+ */
+ template <int IS_FIRST_TILE>
+ __device__ __forceinline__ void ConsumeTile(
+ OutputT &thread_aggregate,
+ OffsetT block_offset, ///< The offset the tile to consume
+ int /*valid_items*/, ///< The number of valid items in the tile
+ Int2Type<true> /*is_full_tile*/, ///< Whether or not this is a full tile
+ Int2Type<false> /*can_vectorize*/) ///< Whether or not we can vectorize loads
+ {
+ OutputT items[ITEMS_PER_THREAD];
+
+ // Load items in striped fashion
+ LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_wrapped_in + block_offset, items);
+
+ // Reduce items within each thread stripe
+ thread_aggregate = (IS_FIRST_TILE) ?
+ internal::ThreadReduce(items, reduction_op) :
+ internal::ThreadReduce(items, reduction_op, thread_aggregate);
+ }
+
+
+ /**
+ * Consume a full tile of input (vectorized)
+ */
+ template <int IS_FIRST_TILE>
+ __device__ __forceinline__ void ConsumeTile(
+ OutputT &thread_aggregate,
+ OffsetT block_offset, ///< The offset the tile to consume
+ int /*valid_items*/, ///< The number of valid items in the tile
+ Int2Type<true> /*is_full_tile*/, ///< Whether or not this is a full tile
+ Int2Type<true> /*can_vectorize*/) ///< Whether or not we can vectorize loads
+ {
+ // Alias items as an array of VectorT and load it in striped fashion
+ enum { WORDS = ITEMS_PER_THREAD / VECTOR_LOAD_LENGTH };
+
+ // Fabricate a vectorized input iterator
+ InputT *d_in_unqualified = const_cast<InputT*>(d_in) + block_offset + (threadIdx.x * VECTOR_LOAD_LENGTH);
+ CacheModifiedInputIterator<AgentReducePolicy::LOAD_MODIFIER, VectorT, OffsetT> d_vec_in(
+ reinterpret_cast<VectorT*>(d_in_unqualified));
+
+ // Load items as vector items
+ InputT input_items[ITEMS_PER_THREAD];
+ VectorT *vec_items = reinterpret_cast<VectorT*>(input_items);
+ #pragma unroll
+ for (int i = 0; i < WORDS; ++i)
+ vec_items[i] = d_vec_in[BLOCK_THREADS * i];
+
+ // Convert from input type to output type
+ OutputT items[ITEMS_PER_THREAD];
+ #pragma unroll
+ for (int i = 0; i < ITEMS_PER_THREAD; ++i)
+ items[i] = input_items[i];
+
+ // Reduce items within each thread stripe
+ thread_aggregate = (IS_FIRST_TILE) ?
+ internal::ThreadReduce(items, reduction_op) :
+ internal::ThreadReduce(items, reduction_op, thread_aggregate);
+ }
+
+
+ /**
+ * Consume a partial tile of input
+ */
+ template <int IS_FIRST_TILE, int CAN_VECTORIZE>
+ __device__ __forceinline__ void ConsumeTile(
+ OutputT &thread_aggregate,
+ OffsetT block_offset, ///< The offset the tile to consume
+ int valid_items, ///< The number of valid items in the tile
+ Int2Type<false> /*is_full_tile*/, ///< Whether or not this is a full tile
+ Int2Type<CAN_VECTORIZE> /*can_vectorize*/) ///< Whether or not we can vectorize loads
+ {
+ // Partial tile
+ int thread_offset = threadIdx.x;
+
+ // Read first item
+ if ((IS_FIRST_TILE) && (thread_offset < valid_items))
+ {
+ thread_aggregate = d_wrapped_in[block_offset + thread_offset];
+ thread_offset += BLOCK_THREADS;
+ }
+
+ // Continue reading items (block-striped)
+ while (thread_offset < valid_items)
+ {
+ OutputT item = d_wrapped_in[block_offset + thread_offset];
+ thread_aggregate = reduction_op(thread_aggregate, item);
+ thread_offset += BLOCK_THREADS;
+ }
+ }
+
+
+ //---------------------------------------------------------------
+ // Consume a contiguous segment of tiles
+ //---------------------------------------------------------------------
+
+ /**
+ * \brief Reduce a contiguous segment of input tiles
+ */
+ template <int CAN_VECTORIZE>
+ __device__ __forceinline__ OutputT ConsumeRange(
+ GridEvenShare<OffsetT> &even_share, ///< GridEvenShare descriptor
+ Int2Type<CAN_VECTORIZE> can_vectorize) ///< Whether or not we can vectorize loads
+ {
+ OutputT thread_aggregate;
+
+ if (even_share.block_offset + TILE_ITEMS > even_share.block_end)
+ {
+ // First tile isn't full (not all threads have valid items)
+ int valid_items = even_share.block_end - even_share.block_offset;
+ ConsumeTile<true>(thread_aggregate, even_share.block_offset, valid_items, Int2Type<false>(), can_vectorize);
+ return BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op, valid_items);
+ }
+
+ // At least one full block
+ ConsumeTile<true>(thread_aggregate, even_share.block_offset, TILE_ITEMS, Int2Type<true>(), can_vectorize);
+ even_share.block_offset += even_share.block_stride;
+
+ // Consume subsequent full tiles of input
+ while (even_share.block_offset + TILE_ITEMS <= even_share.block_end)
+ {
+ ConsumeTile<false>(thread_aggregate, even_share.block_offset, TILE_ITEMS, Int2Type<true>(), can_vectorize);
+ even_share.block_offset += even_share.block_stride;
+ }
+
+ // Consume a partially-full tile
+ if (even_share.block_offset < even_share.block_end)
+ {
+ int valid_items = even_share.block_end - even_share.block_offset;
+ ConsumeTile<false>(thread_aggregate, even_share.block_offset, valid_items, Int2Type<false>(), can_vectorize);
+ }
+
+ // Compute block-wide reduction (all threads have valid items)
+ return BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op);
+ }
+
+
+ /**
+ * \brief Reduce a contiguous segment of input tiles
+ */
+ __device__ __forceinline__ OutputT ConsumeRange(
+ OffsetT block_offset, ///< [in] Threadblock begin offset (inclusive)
+ OffsetT block_end) ///< [in] Threadblock end offset (exclusive)
+ {
+ GridEvenShare<OffsetT> even_share;
+ even_share.template BlockInit<TILE_ITEMS>(block_offset, block_end);
+
+ return (IsAligned(d_in + block_offset, Int2Type<ATTEMPT_VECTORIZATION>())) ?
+ ConsumeRange(even_share, Int2Type<true && ATTEMPT_VECTORIZATION>()) :
+ ConsumeRange(even_share, Int2Type<false && ATTEMPT_VECTORIZATION>());
+ }
+
+
+ /**
+ * Reduce a contiguous segment of input tiles
+ */
+ __device__ __forceinline__ OutputT ConsumeTiles(
+ GridEvenShare<OffsetT> &even_share) ///< [in] GridEvenShare descriptor
+ {
+ // Initialize GRID_MAPPING_STRIP_MINE even-share descriptor for this thread block
+ even_share.template BlockInit<TILE_ITEMS, GRID_MAPPING_STRIP_MINE>();
+
+ return (IsAligned(d_in, Int2Type<ATTEMPT_VECTORIZATION>())) ?
+ ConsumeRange(even_share, Int2Type<true && ATTEMPT_VECTORIZATION>()) :
+ ConsumeRange(even_share, Int2Type<false && ATTEMPT_VECTORIZATION>());
+
+ }
+
+};
+
+
+} // CUB namespace
+CUB_NS_POSTFIX // Optional outer namespace(s)
+
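Reviewer note: AgentReduce above is the per-block workhorse behind cub::DeviceReduce. A minimal host-side sketch of the two-phase calling convention (size query, then run) that ends up in ConsumeTiles/ConsumeRange; pointer names are assumptions for illustration, not part of this diff.

// Illustrative usage sketch only.
#include <cub/cub.cuh>
#include <cuda_runtime.h>

void sum_ints(const int *d_in, int *d_out, int num_items)
{
    void *d_temp_storage = nullptr;
    size_t temp_storage_bytes = 0;

    // Size query: no reduction runs, only temp_storage_bytes is written.
    cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);

    // Actual reduction, executed by AgentReduce thread blocks.
    cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
    cudaFree(d_temp_storage);
}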
diff --git a/3rdparty/cub-1.8.0/cub/agent/agent_reduce_by_key.cuh b/3rdparty/cub-1.8.0/cub/agent/agent_reduce_by_key.cuh
new file mode 100644
index 00000000..51964d3e
--- /dev/null
+++ b/3rdparty/cub-1.8.0/cub/agent/agent_reduce_by_key.cuh
@@ -0,0 +1,547 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::AgentReduceByKey implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key.
+ */
+
+#pragma once
+
+ #include <iterator>
+
+#include "single_pass_scan_operators.cuh"
+#include "../block/block_load.cuh"
+#include "../block/block_store.cuh"
+#include "../block/block_scan.cuh"
+#include "../block/block_discontinuity.cuh"
+#include "../iterator/cache_modified_input_iterator.cuh"
+#include "../iterator/constant_input_iterator.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Tuning policy types
+ ******************************************************************************/
+
+/**
+ * Parameterizable tuning policy type for AgentReduceByKey
+ */
+template <
+ int _BLOCK_THREADS, ///< Threads per thread block
+ int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input)
+ BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use
+ CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements
+ BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use
+struct AgentReduceByKeyPolicy
+{
+ enum
+ {
+ BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block
+ ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input)
+ };
+
+ static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use
+ static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements
+ static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use
+};
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+/**
+ * \brief AgentReduceByKey implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key
+ */
+template <
+ typename AgentReduceByKeyPolicyT, ///< Parameterized AgentReduceByKeyPolicy tuning policy type
+ typename KeysInputIteratorT, ///< Random-access input iterator type for keys
+ typename UniqueOutputIteratorT, ///< Random-access output iterator type for keys
+ typename ValuesInputIteratorT, ///< Random-access input iterator type for values
+ typename AggregatesOutputIteratorT, ///< Random-access output iterator type for values
+ typename NumRunsOutputIteratorT, ///< Output iterator type for recording number of items selected
+ typename EqualityOpT, ///< KeyT equality operator type
+ typename ReductionOpT, ///< ValueT reduction operator type
+ typename OffsetT> ///< Signed integer type for global offsets
+struct AgentReduceByKey
+{
+ //---------------------------------------------------------------------
+ // Types and constants
+ //---------------------------------------------------------------------
+
+ // The input keys type
+ typedef typename std::iterator_traits<KeysInputIteratorT>::value_type KeyInputT;
+
+ // The output keys type
+ typedef typename If<(Equals<typename std::iterator_traits<UniqueOutputIteratorT>::value_type, void>::VALUE), // KeyOutputT = (if output iterator's value type is void) ?
+ typename std::iterator_traits<KeysInputIteratorT>::value_type, // ... then the input iterator's value type,
+ typename std::iterator_traits<UniqueOutputIteratorT>::value_type>::Type KeyOutputT; // ... else the output iterator's value type
+
+ // The input values type
+ typedef typename std::iterator_traits<ValuesInputIteratorT>::value_type ValueInputT;
+
+ // The output values type
+ typedef typename If<(Equals<typename std::iterator_traits<AggregatesOutputIteratorT>::value_type, void>::VALUE), // ValueOutputT = (if output iterator's value type is void) ?
+ typename std::iterator_traits<ValuesInputIteratorT>::value_type, // ... then the input iterator's value type,
+ typename std::iterator_traits<AggregatesOutputIteratorT>::value_type>::Type ValueOutputT; // ... else the output iterator's value type
+
+ // Tuple type for scanning (pairs accumulated segment-value with segment-index)
+ typedef KeyValuePair<OffsetT, ValueOutputT> OffsetValuePairT;
+
+ // Tuple type for pairing keys and values
+ typedef KeyValuePair<KeyOutputT, ValueOutputT> KeyValuePairT;
+
+ // Tile status descriptor interface type
+ typedef ReduceByKeyScanTileState<ValueOutputT, OffsetT> ScanTileStateT;
+
+ // Guarded inequality functor
+ template <typename _EqualityOpT>
+ struct GuardedInequalityWrapper
+ {
+ _EqualityOpT op; ///< Wrapped equality operator
+ int num_remaining; ///< Items remaining
+
+ /// Constructor
+ __host__ __device__ __forceinline__
+ GuardedInequalityWrapper(_EqualityOpT op, int num_remaining) : op(op), num_remaining(num_remaining) {}
+
+ /// Boolean inequality operator, returns (a != b)
+ template <typename T>
+ __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b, int idx) const
+ {
+ if (idx < num_remaining)
+ return !op(a, b); // In bounds
+
+ // Return true if first out-of-bounds item, false otherwise
+ return (idx == num_remaining);
+ }
+ };
+
+
+ // Constants
+ enum
+ {
+ BLOCK_THREADS = AgentReduceByKeyPolicyT::BLOCK_THREADS,
+ ITEMS_PER_THREAD = AgentReduceByKeyPolicyT::ITEMS_PER_THREAD,
+ TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
+ TWO_PHASE_SCATTER = (ITEMS_PER_THREAD > 1),
+
+ // Whether or not the scan operation has a zero-valued identity value (true if we're performing addition on a primitive type)
+ HAS_IDENTITY_ZERO = (Equals<ReductionOpT, cub::Sum>::VALUE) && (Traits<ValueOutputT>::PRIMITIVE),
+ };
+
+ // Cache-modified Input iterator wrapper type (for applying cache modifier) for keys
+ typedef typename If<IsPointer<KeysInputIteratorT>::VALUE,
+ CacheModifiedInputIterator<AgentReduceByKeyPolicyT::LOAD_MODIFIER, KeyInputT, OffsetT>, // Wrap the native input pointer with CacheModifiedValuesInputIterator
+ KeysInputIteratorT>::Type // Directly use the supplied input iterator type
+ WrappedKeysInputIteratorT;
+
+ // Cache-modified Input iterator wrapper type (for applying cache modifier) for values
+ typedef typename If<IsPointer<ValuesInputIteratorT>::VALUE,
+ CacheModifiedInputIterator<AgentReduceByKeyPolicyT::LOAD_MODIFIER, ValueInputT, OffsetT>, // Wrap the native input pointer with CacheModifiedValuesInputIterator
+ ValuesInputIteratorT>::Type // Directly use the supplied input iterator type
+ WrappedValuesInputIteratorT;
+
+ // Cache-modified Input iterator wrapper type (for applying cache modifier) for fixup values
+ typedef typename If<IsPointer<AggregatesOutputIteratorT>::VALUE,
+ CacheModifiedInputIterator<AgentReduceByKeyPolicyT::LOAD_MODIFIER, ValueInputT, OffsetT>, // Wrap the native input pointer with CacheModifiedValuesInputIterator
+ AggregatesOutputIteratorT>::Type // Directly use the supplied input iterator type
+ WrappedFixupInputIteratorT;
+
+ // Reduce-value-by-segment scan operator
+ typedef ReduceBySegmentOp<ReductionOpT> ReduceBySegmentOpT;
+
+ // Parameterized BlockLoad type for keys
+ typedef BlockLoad<
+ KeyOutputT,
+ BLOCK_THREADS,
+ ITEMS_PER_THREAD,
+ AgentReduceByKeyPolicyT::LOAD_ALGORITHM>
+ BlockLoadKeysT;
+
+ // Parameterized BlockLoad type for values
+ typedef BlockLoad<
+ ValueOutputT,
+ BLOCK_THREADS,
+ ITEMS_PER_THREAD,
+ AgentReduceByKeyPolicyT::LOAD_ALGORITHM>
+ BlockLoadValuesT;
+
+ // Parameterized BlockDiscontinuity type for keys
+ typedef BlockDiscontinuity<
+ KeyOutputT,
+ BLOCK_THREADS>
+ BlockDiscontinuityKeys;
+
+ // Parameterized BlockScan type
+ typedef BlockScan<
+ OffsetValuePairT,
+ BLOCK_THREADS,
+ AgentReduceByKeyPolicyT::SCAN_ALGORITHM>
+ BlockScanT;
+
+ // Callback type for obtaining tile prefix during block scan
+ typedef TilePrefixCallbackOp<
+ OffsetValuePairT,
+ ReduceBySegmentOpT,
+ ScanTileStateT>
+ TilePrefixCallbackOpT;
+
+ // Key and value exchange types
+ typedef KeyOutputT KeyExchangeT[TILE_ITEMS + 1];
+ typedef ValueOutputT ValueExchangeT[TILE_ITEMS + 1];
+
+ // Shared memory type for this thread block
+ union _TempStorage
+ {
+ struct
+ {
+ typename BlockScanT::TempStorage scan; // Smem needed for tile scanning
+ typename TilePrefixCallbackOpT::TempStorage prefix; // Smem needed for cooperative prefix callback
+ typename BlockDiscontinuityKeys::TempStorage discontinuity; // Smem needed for discontinuity detection
+ };
+
+ // Smem needed for loading keys
+ typename BlockLoadKeysT::TempStorage load_keys;
+
+ // Smem needed for loading values
+ typename BlockLoadValuesT::TempStorage load_values;
+
+ // Smem needed for compacting key value pairs (allows non-POD items in this union)
+ Uninitialized<KeyValuePairT[TILE_ITEMS + 1]> raw_exchange;
+ };
+
+ // Alias wrapper allowing storage to be unioned
+ struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+ //---------------------------------------------------------------------
+ // Per-thread fields
+ //---------------------------------------------------------------------
+
+ _TempStorage& temp_storage; ///< Reference to temp_storage
+ WrappedKeysInputIteratorT d_keys_in; ///< Input keys
+ UniqueOutputIteratorT d_unique_out; ///< Unique output keys
+ WrappedValuesInputIteratorT d_values_in; ///< Input values
+ AggregatesOutputIteratorT d_aggregates_out; ///< Output value aggregates
+ NumRunsOutputIteratorT d_num_runs_out; ///< Output pointer for total number of segments identified
+ EqualityOpT equality_op; ///< KeyT equality operator
+ ReductionOpT reduction_op; ///< Reduction operator
+ ReduceBySegmentOpT scan_op; ///< Reduce-by-segment scan operator
+
+
+ //---------------------------------------------------------------------
+ // Constructor
+ //---------------------------------------------------------------------
+
+ // Constructor
+ __device__ __forceinline__
+ AgentReduceByKey(
+ TempStorage& temp_storage, ///< Reference to temp_storage
+ KeysInputIteratorT d_keys_in, ///< Input keys
+ UniqueOutputIteratorT d_unique_out, ///< Unique output keys
+ ValuesInputIteratorT d_values_in, ///< Input values
+ AggregatesOutputIteratorT d_aggregates_out, ///< Output value aggregates
+ NumRunsOutputIteratorT d_num_runs_out, ///< Output pointer for total number of segments identified
+ EqualityOpT equality_op, ///< KeyT equality operator
+ ReductionOpT reduction_op) ///< ValueT reduction operator
+ :
+ temp_storage(temp_storage.Alias()),
+ d_keys_in(d_keys_in),
+ d_unique_out(d_unique_out),
+ d_values_in(d_values_in),
+ d_aggregates_out(d_aggregates_out),
+ d_num_runs_out(d_num_runs_out),
+ equality_op(equality_op),
+ reduction_op(reduction_op),
+ scan_op(reduction_op)
+ {}
+
+
+ //---------------------------------------------------------------------
+ // Scatter utility methods
+ //---------------------------------------------------------------------
+
+ /**
+ * Directly scatter flagged items to output offsets
+ */
+ __device__ __forceinline__ void ScatterDirect(
+ KeyValuePairT (&scatter_items)[ITEMS_PER_THREAD],
+ OffsetT (&segment_flags)[ITEMS_PER_THREAD],
+ OffsetT (&segment_indices)[ITEMS_PER_THREAD])
+ {
+ // Scatter flagged keys and values
+ #pragma unroll
+ for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+ {
+ if (segment_flags[ITEM])
+ {
+ d_unique_out[segment_indices[ITEM]] = scatter_items[ITEM].key;
+ d_aggregates_out[segment_indices[ITEM]] = scatter_items[ITEM].value;
+ }
+ }
+ }
+
+
+ /**
+ * 2-phase scatter flagged items to output offsets
+ *
+ * The exclusive scan causes each head flag to be paired with the previous
+ * value aggregate: the scatter offsets must be decremented for value aggregates
+ */
+ __device__ __forceinline__ void ScatterTwoPhase(
+ KeyValuePairT (&scatter_items)[ITEMS_PER_THREAD],
+ OffsetT (&segment_flags)[ITEMS_PER_THREAD],
+ OffsetT (&segment_indices)[ITEMS_PER_THREAD],
+ OffsetT num_tile_segments,
+ OffsetT num_tile_segments_prefix)
+ {
+ CTA_SYNC();
+
+ // Compact and scatter pairs
+ #pragma unroll
+ for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+ {
+ if (segment_flags[ITEM])
+ {
+ temp_storage.raw_exchange.Alias()[segment_indices[ITEM] - num_tile_segments_prefix] = scatter_items[ITEM];
+ }
+ }
+
+ CTA_SYNC();
+
+ for (int item = threadIdx.x; item < num_tile_segments; item += BLOCK_THREADS)
+ {
+ KeyValuePairT pair = temp_storage.raw_exchange.Alias()[item];
+ d_unique_out[num_tile_segments_prefix + item] = pair.key;
+ d_aggregates_out[num_tile_segments_prefix + item] = pair.value;
+ }
+ }
+
+
+ /**
+ * Scatter flagged items
+ */
+ __device__ __forceinline__ void Scatter(
+ KeyValuePairT (&scatter_items)[ITEMS_PER_THREAD],
+ OffsetT (&segment_flags)[ITEMS_PER_THREAD],
+ OffsetT (&segment_indices)[ITEMS_PER_THREAD],
+ OffsetT num_tile_segments,
+ OffsetT num_tile_segments_prefix)
+ {
+ // Do a one-phase scatter if (a) two-phase is disabled or (b) the average number of selected items per thread is less than one
+ if (TWO_PHASE_SCATTER && (num_tile_segments > BLOCK_THREADS))
+ {
+ ScatterTwoPhase(
+ scatter_items,
+ segment_flags,
+ segment_indices,
+ num_tile_segments,
+ num_tile_segments_prefix);
+ }
+ else
+ {
+ ScatterDirect(
+ scatter_items,
+ segment_flags,
+ segment_indices);
+ }
+ }
+
+
+ //---------------------------------------------------------------------
+ // Cooperatively scan a device-wide sequence of tiles with other CTAs
+ //---------------------------------------------------------------------
+
+ /**
+ * Process a tile of input (dynamic chained scan)
+ */
+ template <bool IS_LAST_TILE> ///< Whether the current tile is the last tile
+ __device__ __forceinline__ void ConsumeTile(
+ OffsetT num_remaining, ///< Number of global input items remaining (including this tile)
+ int tile_idx, ///< Tile index
+ OffsetT tile_offset, ///< Tile offset
+ ScanTileStateT& tile_state) ///< Global tile state descriptor
+ {
+ KeyOutputT keys[ITEMS_PER_THREAD]; // Tile keys
+ KeyOutputT prev_keys[ITEMS_PER_THREAD]; // Tile keys shuffled up
+ ValueOutputT values[ITEMS_PER_THREAD]; // Tile values
+ OffsetT head_flags[ITEMS_PER_THREAD]; // Segment head flags
+ OffsetT segment_indices[ITEMS_PER_THREAD]; // Segment indices
+ OffsetValuePairT scan_items[ITEMS_PER_THREAD]; // Zipped values and segment flags|indices
+ KeyValuePairT scatter_items[ITEMS_PER_THREAD]; // Zipped key value pairs for scattering
+
+ // Load keys
+ if (IS_LAST_TILE)
+ BlockLoadKeysT(temp_storage.load_keys).Load(d_keys_in + tile_offset, keys, num_remaining);
+ else
+ BlockLoadKeysT(temp_storage.load_keys).Load(d_keys_in + tile_offset, keys);
+
+ // Load tile predecessor key in first thread
+ KeyOutputT tile_predecessor;
+ if (threadIdx.x == 0)
+ {
+ tile_predecessor = (tile_idx == 0) ?
+ keys[0] : // First tile gets repeat of first item (thus first item will not be flagged as a head)
+ d_keys_in[tile_offset - 1]; // Subsequent tiles get last key from previous tile
+ }
+
+ CTA_SYNC();
+
+ // Load values
+ if (IS_LAST_TILE)
+ BlockLoadValuesT(temp_storage.load_values).Load(d_values_in + tile_offset, values, num_remaining);
+ else
+ BlockLoadValuesT(temp_storage.load_values).Load(d_values_in + tile_offset, values);
+
+ CTA_SYNC();
+
+ // Initialize head-flags and shuffle up the previous keys
+ if (IS_LAST_TILE)
+ {
+ // Use custom flag operator to additionally flag the first out-of-bounds item
+ GuardedInequalityWrapper<EqualityOpT> flag_op(equality_op, num_remaining);
+ BlockDiscontinuityKeys(temp_storage.discontinuity).FlagHeads(
+ head_flags, keys, prev_keys, flag_op, tile_predecessor);
+ }
+ else
+ {
+ InequalityWrapper<EqualityOpT> flag_op(equality_op);
+ BlockDiscontinuityKeys(temp_storage.discontinuity).FlagHeads(
+ head_flags, keys, prev_keys, flag_op, tile_predecessor);
+ }
+
+ // Zip values and head flags
+ #pragma unroll
+ for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+ {
+ scan_items[ITEM].value = values[ITEM];
+ scan_items[ITEM].key = head_flags[ITEM];
+ }
+
+ // Perform exclusive tile scan
+ OffsetValuePairT block_aggregate; // Inclusive block-wide scan aggregate
+ OffsetT num_segments_prefix; // Number of segments prior to this tile
+ OffsetValuePairT total_aggregate; // The tile prefix folded with block_aggregate
+ if (tile_idx == 0)
+ {
+ // Scan first tile
+ BlockScanT(temp_storage.scan).ExclusiveScan(scan_items, scan_items, scan_op, block_aggregate);
+ num_segments_prefix = 0;
+ total_aggregate = block_aggregate;
+
+ // Update tile status if there are successor tiles
+ if ((!IS_LAST_TILE) && (threadIdx.x == 0))
+ tile_state.SetInclusive(0, block_aggregate);
+ }
+ else
+ {
+ // Scan non-first tile
+ TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.prefix, scan_op, tile_idx);
+ BlockScanT(temp_storage.scan).ExclusiveScan(scan_items, scan_items, scan_op, prefix_op);
+
+ block_aggregate = prefix_op.GetBlockAggregate();
+ num_segments_prefix = prefix_op.GetExclusivePrefix().key;
+ total_aggregate = prefix_op.GetInclusivePrefix();
+ }
+
+ // Rezip scatter items and segment indices
+ #pragma unroll
+ for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+ {
+ scatter_items[ITEM].key = prev_keys[ITEM];
+ scatter_items[ITEM].value = scan_items[ITEM].value;
+ segment_indices[ITEM] = scan_items[ITEM].key;
+ }
+
+ // At this point, each flagged segment head has:
+ // - The key for the previous segment
+ // - The reduced value from the previous segment
+ // - The segment index for the reduced value
+
+ // Scatter flagged keys and values
+ OffsetT num_tile_segments = block_aggregate.key;
+ Scatter(scatter_items, head_flags, segment_indices, num_tile_segments, num_segments_prefix);
+
+ // Last thread in last tile will output final count (and last pair, if necessary)
+ if ((IS_LAST_TILE) && (threadIdx.x == BLOCK_THREADS - 1))
+ {
+ OffsetT num_segments = num_segments_prefix + num_tile_segments;
+
+ // If the last tile is a whole tile, output the final_value
+ if (num_remaining == TILE_ITEMS)
+ {
+ d_unique_out[num_segments] = keys[ITEMS_PER_THREAD - 1];
+ d_aggregates_out[num_segments] = total_aggregate.value;
+ num_segments++;
+ }
+
+ // Output the total number of items selected
+ *d_num_runs_out = num_segments;
+ }
+ }
+
+
+ /**
+ * Scan tiles of items as part of a dynamic chained scan
+ */
+ __device__ __forceinline__ void ConsumeRange(
+ int num_items, ///< Total number of input items
+ ScanTileStateT& tile_state, ///< Global tile state descriptor
+ int start_tile) ///< The starting tile for the current grid
+ {
+ // Blocks are launched in increasing order, so just assign one tile per block
+ int tile_idx = start_tile + blockIdx.x; // Current tile index
+ OffsetT tile_offset = OffsetT(TILE_ITEMS) * tile_idx; // Global offset for the current tile
+ OffsetT num_remaining = num_items - tile_offset; // Remaining items (including this tile)
+
+ if (num_remaining > TILE_ITEMS)
+ {
+ // Not last tile
+ ConsumeTile<false>(num_remaining, tile_idx, tile_offset, tile_state);
+ }
+ else if (num_remaining > 0)
+ {
+ // Last tile
+ ConsumeTile<true>(num_remaining, tile_idx, tile_offset, tile_state);
+ }
+ }
+
+};
+
+
+} // CUB namespace
+CUB_NS_POSTFIX // Optional outer namespace(s)
+
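Reviewer note: AgentReduceByKey is driven by cub::DeviceReduce::ReduceByKey, which segments the key sequence at discontinuities and reduces each run of values, the head-flag plus tile-scan scheme in ConsumeTile above. A hedged host-side sketch follows; the device buffers are assumed to be allocated and sized by the caller, and cub::Sum() is just one choice of reduction operator.

// Illustrative usage sketch only.
#include <cub/cub.cuh>
#include <cuda_runtime.h>

void reduce_by_key(const int *d_keys_in, int *d_unique_out,
                   const float *d_values_in, float *d_aggregates_out,
                   int *d_num_runs_out, int num_items)
{
    void *d_temp_storage = nullptr;
    size_t temp_storage_bytes = 0;

    // Size query, then the actual pass executed by AgentReduceByKey tiles.
    cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes,
                                   d_keys_in, d_unique_out,
                                   d_values_in, d_aggregates_out,
                                   d_num_runs_out, cub::Sum(), num_items);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes,
                                   d_keys_in, d_unique_out,
                                   d_values_in, d_aggregates_out,
                                   d_num_runs_out, cub::Sum(), num_items);
    cudaFree(d_temp_storage);
}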
diff --git a/3rdparty/cub-1.8.0/cub/agent/agent_rle.cuh b/3rdparty/cub-1.8.0/cub/agent/agent_rle.cuh
new file mode 100644
index 00000000..cb7a4a65
--- /dev/null
+++ b/3rdparty/cub-1.8.0/cub/agent/agent_rle.cuh
@@ -0,0 +1,837 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::AgentRle implements a stateful abstraction of CUDA thread blocks for participating in device-wide run-length-encode.
+ */
+
+#pragma once
+
+ #include <iterator>
+
+#include "single_pass_scan_operators.cuh"
+#include "../block/block_load.cuh"
+#include "../block/block_store.cuh"
+#include "../block/block_scan.cuh"
+#include "../block/block_exchange.cuh"
+#include "../block/block_discontinuity.cuh"
+#include "../grid/grid_queue.cuh"
+#include "../iterator/cache_modified_input_iterator.cuh"
+#include "../iterator/constant_input_iterator.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Tuning policy types
+ ******************************************************************************/
+
+/**
+ * Parameterizable tuning policy type for AgentRle
+ */
+template <
+ int _BLOCK_THREADS, ///< Threads per thread block
+ int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input)
+ BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use
+ CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements
+ bool _STORE_WARP_TIME_SLICING, ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage)
+ BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use
+struct AgentRlePolicy
+{
+ enum
+ {
+ BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block
+ ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input)
+ STORE_WARP_TIME_SLICING = _STORE_WARP_TIME_SLICING, ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage)
+ };
+
+ static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use
+ static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements
+ static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use
+};
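+
+// For reference, the dispatch layer instantiates this policy with concrete
+// compile-time values. The figures below are purely illustrative (an example
+// typedef, not a tuned configuration):
+//
+//   typedef AgentRlePolicy<
+//       96,                         // _BLOCK_THREADS
+//       15,                         // _ITEMS_PER_THREAD
+//       BLOCK_LOAD_DIRECT,          // _LOAD_ALGORITHM
+//       LOAD_LDG,                   // _LOAD_MODIFIER
+//       true,                       // _STORE_WARP_TIME_SLICING
+//       BLOCK_SCAN_WARP_SCANS>      // _SCAN_ALGORITHM
+//       ExampleRlePolicy;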
+
+
+
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+/**
+ * \brief AgentRle implements a stateful abstraction of CUDA thread blocks for participating in device-wide run-length-encode
+ */
+template <
+ typename AgentRlePolicyT, ///< Parameterized AgentRlePolicyT tuning policy type
+ typename InputIteratorT, ///< Random-access input iterator type for data
+ typename OffsetsOutputIteratorT, ///< Random-access output iterator type for offset values
+ typename LengthsOutputIteratorT, ///< Random-access output iterator type for length values
+ typename EqualityOpT, ///< T equality operator type
+ typename OffsetT> ///< Signed integer type for global offsets
+struct AgentRle
+{
+ //---------------------------------------------------------------------
+ // Types and constants
+ //---------------------------------------------------------------------
+
+ /// The input value type
+ typedef typename std::iterator_traits<InputIteratorT>::value_type T;
+
+ /// The lengths output value type
+ typedef typename If<(Equals<typename std::iterator_traits<LengthsOutputIteratorT>::value_type, void>::VALUE), // LengthT = (if output iterator's value type is void) ?
+ OffsetT, // ... then the OffsetT type,
+ typename std::iterator_traits<LengthsOutputIteratorT>::value_type>::Type LengthT; // ... else the output iterator's value type
+
+ /// Tuple type for scanning (pairs run-length and run-index)
+ typedef KeyValuePair<OffsetT, LengthT> LengthOffsetPair;
+
+ /// Tile status descriptor interface type
+ typedef ReduceByKeyScanTileState<LengthT, OffsetT> ScanTileStateT;
+
+ // Constants
+ enum
+ {
+ WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH),
+ BLOCK_THREADS = AgentRlePolicyT::BLOCK_THREADS,
+ ITEMS_PER_THREAD = AgentRlePolicyT::ITEMS_PER_THREAD,
+ WARP_ITEMS = WARP_THREADS * ITEMS_PER_THREAD,
+ TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
+ WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
+
+ /// Whether or not to sync after loading data
+ SYNC_AFTER_LOAD = (AgentRlePolicyT::LOAD_ALGORITHM != BLOCK_LOAD_DIRECT),
+
+ /// Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage)
+ STORE_WARP_TIME_SLICING = AgentRlePolicyT::STORE_WARP_TIME_SLICING,
+ ACTIVE_EXCHANGE_WARPS = (STORE_WARP_TIME_SLICING) ? 1 : WARPS,
+ };
+
+
+ /**
+ * Special operator that signals all out-of-bounds items are not equal to everything else,
+ * forcing both (1) the last item to be tail-flagged and (2) all oob items to be marked
+ * trivial.
+ */
+ template <bool LAST_TILE>
+ struct OobInequalityOp
+ {
+ OffsetT num_remaining;
+ EqualityOpT equality_op;
+
+ __device__ __forceinline__ OobInequalityOp(
+ OffsetT num_remaining,
+ EqualityOpT equality_op)
+ :
+ num_remaining(num_remaining),
+ equality_op(equality_op)
+ {}
+
+ template <typename Index>
+ __host__ __device__ __forceinline__ bool operator()(T first, T second, Index idx)
+ {
+ if (!LAST_TILE || (idx < num_remaining))
+ return !equality_op(first, second);
+ else
+ return true;
+ }
+ };
+
+
+ // Cache-modified Input iterator wrapper type (for applying cache modifier) for data
+ typedef typename If<IsPointer<InputIteratorT>::VALUE,
+ CacheModifiedInputIterator<AgentRlePolicyT::LOAD_MODIFIER, T, OffsetT>, // Wrap the native input pointer with CacheModifiedInputIterator
+ InputIteratorT>::Type // Directly use the supplied input iterator type
+ WrappedInputIteratorT;
+
+ // Parameterized BlockLoad type for data
+ typedef BlockLoad<
+ T,
+ AgentRlePolicyT::BLOCK_THREADS,
+ AgentRlePolicyT::ITEMS_PER_THREAD,
+ AgentRlePolicyT::LOAD_ALGORITHM>
+ BlockLoadT;
+
+ // Parameterized BlockDiscontinuity type for data
+ typedef BlockDiscontinuity<T, BLOCK_THREADS> BlockDiscontinuityT;
+
+ // Parameterized WarpScan type
+ typedef WarpScan<LengthOffsetPair> WarpScanPairs;
+
+ // Reduce-length-by-run scan operator
+ typedef ReduceBySegmentOp<cub::Sum> ReduceBySegmentOpT;
+
+ // Callback type for obtaining tile prefix during block scan
+ typedef TilePrefixCallbackOp<
+ LengthOffsetPair,
+ ReduceBySegmentOpT,
+ ScanTileStateT>
+ TilePrefixCallbackOpT;
+
+ // Warp exchange types
+ typedef WarpExchange<LengthOffsetPair, ITEMS_PER_THREAD> WarpExchangePairs;
+
+ typedef typename If<STORE_WARP_TIME_SLICING, typename WarpExchangePairs::TempStorage, NullType>::Type WarpExchangePairsStorage;
+
+ typedef WarpExchange<OffsetT, ITEMS_PER_THREAD> WarpExchangeOffsets;
+ typedef WarpExchange<LengthT, ITEMS_PER_THREAD> WarpExchangeLengths;
+
+ typedef LengthOffsetPair WarpAggregates[WARPS];
+
+ // Shared memory type for this thread block
+ struct _TempStorage
+ {
+ // Aliasable storage layout
+ union Aliasable
+ {
+ struct
+ {
+ typename BlockDiscontinuityT::TempStorage discontinuity; // Smem needed for discontinuity detection
+ typename WarpScanPairs::TempStorage warp_scan[WARPS]; // Smem needed for warp-synchronous scans
+ Uninitialized<WarpAggregates> warp_aggregates; // Smem needed for sharing warp-wide aggregates
+ typename TilePrefixCallbackOpT::TempStorage prefix; // Smem needed for cooperative prefix callback
+ };
+
+ // Smem needed for input loading
+ typename BlockLoadT::TempStorage load;
+
+ // Aliasable layout needed for two-phase scatter
+ union ScatterAliasable
+ {
+ unsigned long long align;
+ WarpExchangePairsStorage exchange_pairs[ACTIVE_EXCHANGE_WARPS];
+ typename WarpExchangeOffsets::TempStorage exchange_offsets[ACTIVE_EXCHANGE_WARPS];
+ typename WarpExchangeLengths::TempStorage exchange_lengths[ACTIVE_EXCHANGE_WARPS];
+
+ } scatter_aliasable;
+
+ } aliasable;
+
+ OffsetT tile_idx; // Shared tile index
+ LengthOffsetPair tile_inclusive; // Inclusive tile prefix
+ LengthOffsetPair tile_exclusive; // Exclusive tile prefix
+ };
+
+ // Alias wrapper allowing storage to be unioned
+ struct TempStorage : Uninitialized<_TempStorage> {};
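+
+ // Note on the aliasable layout above: the load storage, the
+ // discontinuity/scan/prefix storage, and the two-phase scatter exchange
+ // storage are needed in different phases of tile processing, so unioning
+ // them keeps the block's shared-memory footprint at the size of the largest
+ // member rather than the sum of all members. The Uninitialized wrapper
+ // presents the union as raw storage so no constructors run on shared memory.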
+
+
+ //---------------------------------------------------------------------
+ // Per-thread fields
+ //---------------------------------------------------------------------
+
+ _TempStorage& temp_storage; ///< Reference to temp_storage
+
+ WrappedInputIteratorT d_in; ///< Pointer to input sequence of data items
+ OffsetsOutputIteratorT d_offsets_out; ///< Input run offsets
+ LengthsOutputIteratorT d_lengths_out; ///< Output run lengths
+
+ EqualityOpT equality_op; ///< T equality operator
+ ReduceBySegmentOpT scan_op; ///< Reduce-length-by-flag scan operator
+ OffsetT num_items; ///< Total number of input items
+
+
+ //---------------------------------------------------------------------
+ // Constructor
+ //---------------------------------------------------------------------
+
+ // Constructor
+ __device__ __forceinline__
+ AgentRle(
+ TempStorage &temp_storage, ///< [in] Reference to temp_storage
+ InputIteratorT d_in, ///< [in] Pointer to input sequence of data items
+ OffsetsOutputIteratorT d_offsets_out, ///< [out] Pointer to output sequence of run offsets
+ LengthsOutputIteratorT d_lengths_out, ///< [out] Pointer to output sequence of run lengths
+ EqualityOpT equality_op, ///< [in] T equality operator
+ OffsetT num_items) ///< [in] Total number of input items
+ :
+ temp_storage(temp_storage.Alias()),
+ d_in(d_in),
+ d_offsets_out(d_offsets_out),
+ d_lengths_out(d_lengths_out),
+ equality_op(equality_op),
+ scan_op(cub::Sum()),
+ num_items(num_items)
+ {}
+
+
+ //---------------------------------------------------------------------
+ // Utility methods for initializing the selections
+ //---------------------------------------------------------------------
+
+ template <bool FIRST_TILE, bool LAST_TILE>
+ __device__ __forceinline__ void InitializeSelections(
+ OffsetT tile_offset,
+ OffsetT num_remaining,
+ T (&items)[ITEMS_PER_THREAD],
+ LengthOffsetPair (&lengths_and_num_runs)[ITEMS_PER_THREAD])
+ {
+ bool head_flags[ITEMS_PER_THREAD];
+ bool tail_flags[ITEMS_PER_THREAD];
+
+ OobInequalityOp<LAST_TILE> inequality_op(num_remaining, equality_op);
+
+ if (FIRST_TILE && LAST_TILE)
+ {
+ // First-and-last-tile always head-flags the first item and tail-flags the last item
+
+ BlockDiscontinuityT(temp_storage.aliasable.discontinuity).FlagHeadsAndTails(
+ head_flags, tail_flags, items, inequality_op);
+ }
+ else if (FIRST_TILE)
+ {
+ // First-tile always head-flags the first item
+
+ // Get the first item from the next tile
+ T tile_successor_item;
+ if (threadIdx.x == BLOCK_THREADS - 1)
+ tile_successor_item = d_in[tile_offset + TILE_ITEMS];
+
+ BlockDiscontinuityT(temp_storage.aliasable.discontinuity).FlagHeadsAndTails(
+ head_flags, tail_flags, tile_successor_item, items, inequality_op);
+ }
+ else if (LAST_TILE)
+ {
+ // Last-tile always flags the last item
+
+ // Get the last item from the previous tile
+ T tile_predecessor_item;
+ if (threadIdx.x == 0)
+ tile_predecessor_item = d_in[tile_offset - 1];
+
+ BlockDiscontinuityT(temp_storage.aliasable.discontinuity).FlagHeadsAndTails(
+ head_flags, tile_predecessor_item, tail_flags, items, inequality_op);
+ }
+ else
+ {
+ // Get the first item from the next tile
+ T tile_successor_item;
+ if (threadIdx.x == BLOCK_THREADS - 1)
+ tile_successor_item = d_in[tile_offset + TILE_ITEMS];
+
+ // Get the last item from the previous tile
+ T tile_predecessor_item;
+ if (threadIdx.x == 0)
+ tile_predecessor_item = d_in[tile_offset - 1];
+
+ BlockDiscontinuityT(temp_storage.aliasable.discontinuity).FlagHeadsAndTails(
+ head_flags, tile_predecessor_item, tail_flags, tile_successor_item, items, inequality_op);
+ }
+
+ // Zip counts and runs
+ #pragma unroll
+ for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+ {
+ lengths_and_num_runs[ITEM].key = head_flags[ITEM] && (!tail_flags[ITEM]);
+ lengths_and_num_runs[ITEM].value = ((!head_flags[ITEM]) || (!tail_flags[ITEM]));
+ }
+ }
+
+ //---------------------------------------------------------------------
+ // Scan utility methods
+ //---------------------------------------------------------------------
+
+ /**
+ * Scan of allocations
+ */
+ __device__ __forceinline__ void WarpScanAllocations(
+ LengthOffsetPair &tile_aggregate,
+ LengthOffsetPair &warp_aggregate,
+ LengthOffsetPair &warp_exclusive_in_tile,
+ LengthOffsetPair &thread_exclusive_in_warp,
+ LengthOffsetPair (&lengths_and_num_runs)[ITEMS_PER_THREAD])
+ {
+ // Perform warpscans
+ unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS);
+ int lane_id = LaneId();
+
+ LengthOffsetPair identity;
+ identity.key = 0;
+ identity.value = 0;
+
+ LengthOffsetPair thread_inclusive;
+ LengthOffsetPair thread_aggregate = internal::ThreadReduce(lengths_and_num_runs, scan_op);
+ WarpScanPairs(temp_storage.aliasable.warp_scan[warp_id]).Scan(
+ thread_aggregate,
+ thread_inclusive,
+ thread_exclusive_in_warp,
+ identity,
+ scan_op);
+
+ // Last lane in each warp shares its warp-aggregate
+ if (lane_id == WARP_THREADS - 1)
+ temp_storage.aliasable.warp_aggregates.Alias()[warp_id] = thread_inclusive;
+
+ CTA_SYNC();
+
+ // Accumulate total selected and the warp-wide prefix
+ warp_exclusive_in_tile = identity;
+ warp_aggregate = temp_storage.aliasable.warp_aggregates.Alias()[warp_id];
+ tile_aggregate = temp_storage.aliasable.warp_aggregates.Alias()[0];
+
+ #pragma unroll
+ for (int WARP = 1; WARP < WARPS; ++WARP)
+ {
+ if (warp_id == WARP)
+ warp_exclusive_in_tile = tile_aggregate;
+
+ tile_aggregate = scan_op(tile_aggregate, temp_storage.aliasable.warp_aggregates.Alias()[WARP]);
+ }
+ }
+
+
+ //---------------------------------------------------------------------
+ // Utility methods for scattering selections
+ //---------------------------------------------------------------------
+
+ /**
+ * Two-phase scatter, specialized for warp time-slicing
+ */
+ template <bool FIRST_TILE>
+ __device__ __forceinline__ void ScatterTwoPhase(
+ OffsetT tile_num_runs_exclusive_in_global,
+ OffsetT warp_num_runs_aggregate,
+ OffsetT warp_num_runs_exclusive_in_tile,
+ OffsetT (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD],
+ LengthOffsetPair (&lengths_and_offsets)[ITEMS_PER_THREAD],
+ Int2Type<true> is_warp_time_slice)
+ {
+ unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS);
+ int lane_id = LaneId();
+
+ // Locally compact items within the warp (first warp)
+ if (warp_id == 0)
+ {
+ WarpExchangePairs(temp_storage.aliasable.scatter_aliasable.exchange_pairs[0]).ScatterToStriped(
+ lengths_and_offsets, thread_num_runs_exclusive_in_warp);
+ }
+
+ // Locally compact items within the warp (remaining warps)
+ #pragma unroll
+ for (int SLICE = 1; SLICE < WARPS; ++SLICE)
+ {
+ CTA_SYNC();
+
+ if (warp_id == SLICE)
+ {
+ WarpExchangePairs(temp_storage.aliasable.scatter_aliasable.exchange_pairs[0]).ScatterToStriped(
+ lengths_and_offsets, thread_num_runs_exclusive_in_warp);
+ }
+ }
+
+ // Global scatter
+ #pragma unroll
+ for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+ {
+ if ((ITEM * WARP_THREADS) < warp_num_runs_aggregate - lane_id)
+ {
+ OffsetT item_offset =
+ tile_num_runs_exclusive_in_global +
+ warp_num_runs_exclusive_in_tile +
+ (ITEM * WARP_THREADS) + lane_id;
+
+ // Scatter offset
+ d_offsets_out[item_offset] = lengths_and_offsets[ITEM].key;
+
+ // Scatter length if not the first (global) length
+ if ((!FIRST_TILE) || (ITEM != 0) || (threadIdx.x > 0))
+ {
+ d_lengths_out[item_offset - 1] = lengths_and_offsets[ITEM].value;
+ }
+ }
+ }
+ }
+
+
+ /**
+ * Two-phase scatter
+ */
+ template <bool FIRST_TILE>
+ __device__ __forceinline__ void ScatterTwoPhase(
+ OffsetT tile_num_runs_exclusive_in_global,
+ OffsetT warp_num_runs_aggregate,
+ OffsetT warp_num_runs_exclusive_in_tile,
+ OffsetT (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD],
+ LengthOffsetPair (&lengths_and_offsets)[ITEMS_PER_THREAD],
+ Int2Type<false> is_warp_time_slice)
+ {
+ unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS);
+ int lane_id = LaneId();
+
+ // Unzip
+ OffsetT run_offsets[ITEMS_PER_THREAD];
+ LengthT run_lengths[ITEMS_PER_THREAD];
+
+ #pragma unroll
+ for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+ {
+ run_offsets[ITEM] = lengths_and_offsets[ITEM].key;
+ run_lengths[ITEM] = lengths_and_offsets[ITEM].value;
+ }
+
+ WarpExchangeOffsets(temp_storage.aliasable.scatter_aliasable.exchange_offsets[warp_id]).ScatterToStriped(
+ run_offsets, thread_num_runs_exclusive_in_warp);
+
+ WARP_SYNC(0xffffffff);
+
+ WarpExchangeLengths(temp_storage.aliasable.scatter_aliasable.exchange_lengths[warp_id]).ScatterToStriped(
+ run_lengths, thread_num_runs_exclusive_in_warp);
+
+ // Global scatter
+ #pragma unroll
+ for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+ {
+ if ((ITEM * WARP_THREADS) + lane_id < warp_num_runs_aggregate)
+ {
+ OffsetT item_offset =
+ tile_num_runs_exclusive_in_global +
+ warp_num_runs_exclusive_in_tile +
+ (ITEM * WARP_THREADS) + lane_id;
+
+ // Scatter offset
+ d_offsets_out[item_offset] = run_offsets[ITEM];
+
+ // Scatter length if not the first (global) length
+ if ((!FIRST_TILE) || (ITEM != 0) || (threadIdx.x > 0))
+ {
+ d_lengths_out[item_offset - 1] = run_lengths[ITEM];
+ }
+ }
+ }
+ }
+
+
+ /**
+ * Direct scatter
+ */
+ template <bool FIRST_TILE>
+ __device__ __forceinline__ void ScatterDirect(
+ OffsetT tile_num_runs_exclusive_in_global,
+ OffsetT warp_num_runs_aggregate,
+ OffsetT warp_num_runs_exclusive_in_tile,
+ OffsetT (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD],
+ LengthOffsetPair (&lengths_and_offsets)[ITEMS_PER_THREAD])
+ {
+ #pragma unroll
+ for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+ {
+ if (thread_num_runs_exclusive_in_warp[ITEM] < warp_num_runs_aggregate)
+ {
+ OffsetT item_offset =
+ tile_num_runs_exclusive_in_global +
+ warp_num_runs_exclusive_in_tile +
+ thread_num_runs_exclusive_in_warp[ITEM];
+
+ // Scatter offset
+ d_offsets_out[item_offset] = lengths_and_offsets[ITEM].key;
+
+ // Scatter length if not the first (global) length
+ if (item_offset >= 1)
+ {
+ d_lengths_out[item_offset - 1] = lengths_and_offsets[ITEM].value;
+ }
+ }
+ }
+ }
+
+
+ /**
+ * Scatter
+ */
+ template <bool FIRST_TILE, bool LAST_TILE>
+ __device__ __forceinline__ void Scatter(
+ OffsetT tile_num_runs_aggregate,
+ OffsetT tile_num_runs_exclusive_in_global,
+ OffsetT warp_num_runs_aggregate,
+ OffsetT warp_num_runs_exclusive_in_tile,
+ OffsetT (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD],
+ LengthOffsetPair (&lengths_and_offsets)[ITEMS_PER_THREAD])
+ {
+ if ((ITEMS_PER_THREAD == 1) || (tile_num_runs_aggregate < BLOCK_THREADS))
+ {
+ // Direct scatter if the warp has any items
+ if (warp_num_runs_aggregate)
+ {
+ ScatterDirect<FIRST_TILE>(
+ tile_num_runs_exclusive_in_global,
+ warp_num_runs_aggregate,
+ warp_num_runs_exclusive_in_tile,
+ thread_num_runs_exclusive_in_warp,
+ lengths_and_offsets);
+ }
+ }
+ else
+ {
+ // Scatter two phase
+ ScatterTwoPhase<FIRST_TILE>(
+ tile_num_runs_exclusive_in_global,
+ warp_num_runs_aggregate,
+ warp_num_runs_exclusive_in_tile,
+ thread_num_runs_exclusive_in_warp,
+ lengths_and_offsets,
+ Int2Type<STORE_WARP_TIME_SLICING>());
+ }
+ }
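+
+ // Rationale for the dispatch above: when a tile yields relatively few runs
+ // (fewer than BLOCK_THREADS), each thread can write its runs directly to
+ // global memory without first compacting them. When runs are dense, the
+ // two-phase path first compacts runs into a striped arrangement through
+ // shared memory so that the subsequent global writes are better coalesced.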
+
+
+
+ //---------------------------------------------------------------------
+ // Cooperatively scan a device-wide sequence of tiles with other CTAs
+ //---------------------------------------------------------------------
+
+ /**
+ * Process a tile of input (dynamic chained scan)
+ */
+ template <
+ bool LAST_TILE>
+ __device__ __forceinline__ LengthOffsetPair ConsumeTile(
+ OffsetT num_items, ///< Total number of global input items
+ OffsetT num_remaining, ///< Number of global input items remaining (including this tile)
+ int tile_idx, ///< Tile index
+ OffsetT tile_offset, ///< Tile offset
+ ScanTileStateT &tile_status) ///< Global list of tile status
+ {
+ if (tile_idx == 0)
+ {
+ // First tile
+
+ // Load items
+ T items[ITEMS_PER_THREAD];
+ if (LAST_TILE)
+ BlockLoadT(temp_storage.aliasable.load).Load(d_in + tile_offset, items, num_remaining, T());
+ else
+ BlockLoadT(temp_storage.aliasable.load).Load(d_in + tile_offset, items);
+
+ if (SYNC_AFTER_LOAD)
+ CTA_SYNC();
+
+ // Set flags
+ LengthOffsetPair lengths_and_num_runs[ITEMS_PER_THREAD];
+
+ InitializeSelections<true, LAST_TILE>(
+ tile_offset,
+ num_remaining,
+ items,
+ lengths_and_num_runs);
+
+ // Exclusive scan of lengths and runs
+ LengthOffsetPair tile_aggregate;
+ LengthOffsetPair warp_aggregate;
+ LengthOffsetPair warp_exclusive_in_tile;
+ LengthOffsetPair thread_exclusive_in_warp;
+
+ WarpScanAllocations(
+ tile_aggregate,
+ warp_aggregate,
+ warp_exclusive_in_tile,
+ thread_exclusive_in_warp,
+ lengths_and_num_runs);
+
+ // Update tile status if this is not the last tile
+ if (!LAST_TILE && (threadIdx.x == 0))
+ tile_status.SetInclusive(0, tile_aggregate);
+
+ // Update thread_exclusive_in_warp to fold in warp run-length
+ if (thread_exclusive_in_warp.key == 0)
+ thread_exclusive_in_warp.value += warp_exclusive_in_tile.value;
+
+ LengthOffsetPair lengths_and_offsets[ITEMS_PER_THREAD];
+ OffsetT thread_num_runs_exclusive_in_warp[ITEMS_PER_THREAD];
+ LengthOffsetPair lengths_and_num_runs2[ITEMS_PER_THREAD];
+
+ // Downsweep scan through lengths_and_num_runs
+ internal::ThreadScanExclusive(lengths_and_num_runs, lengths_and_num_runs2, scan_op, thread_exclusive_in_warp);
+
+ // Zip
+
+ #pragma unroll
+ for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+ {
+ lengths_and_offsets[ITEM].value = lengths_and_num_runs2[ITEM].value;
+ lengths_and_offsets[ITEM].key = tile_offset + (threadIdx.x * ITEMS_PER_THREAD) + ITEM;
+ thread_num_runs_exclusive_in_warp[ITEM] = (lengths_and_num_runs[ITEM].key) ?
+ lengths_and_num_runs2[ITEM].key : // keep
+ WARP_THREADS * ITEMS_PER_THREAD; // discard
+ }
+
+ OffsetT tile_num_runs_aggregate = tile_aggregate.key;
+ OffsetT tile_num_runs_exclusive_in_global = 0;
+ OffsetT warp_num_runs_aggregate = warp_aggregate.key;
+ OffsetT warp_num_runs_exclusive_in_tile = warp_exclusive_in_tile.key;
+
+ // Scatter
+ Scatter<true, LAST_TILE>(
+ tile_num_runs_aggregate,
+ tile_num_runs_exclusive_in_global,
+ warp_num_runs_aggregate,
+ warp_num_runs_exclusive_in_tile,
+ thread_num_runs_exclusive_in_warp,
+ lengths_and_offsets);
+
+ // Return running total (inclusive of this tile)
+ return tile_aggregate;
+ }
+ else
+ {
+ // Not first tile
+
+ // Load items
+ T items[ITEMS_PER_THREAD];
+ if (LAST_TILE)
+ BlockLoadT(temp_storage.aliasable.load).Load(d_in + tile_offset, items, num_remaining, T());
+ else
+ BlockLoadT(temp_storage.aliasable.load).Load(d_in + tile_offset, items);
+
+ if (SYNC_AFTER_LOAD)
+ CTA_SYNC();
+
+ // Set flags
+ LengthOffsetPair lengths_and_num_runs[ITEMS_PER_THREAD];
+
+ InitializeSelections<false, LAST_TILE>(
+ tile_offset,
+ num_remaining,
+ items,
+ lengths_and_num_runs);
+
+ // Exclusive scan of lengths and runs
+ LengthOffsetPair tile_aggregate;
+ LengthOffsetPair warp_aggregate;
+ LengthOffsetPair warp_exclusive_in_tile;
+ LengthOffsetPair thread_exclusive_in_warp;
+
+ WarpScanAllocations(
+ tile_aggregate,
+ warp_aggregate,
+ warp_exclusive_in_tile,
+ thread_exclusive_in_warp,
+ lengths_and_num_runs);
+
+ // First warp computes tile prefix in lane 0
+ TilePrefixCallbackOpT prefix_op(tile_status, temp_storage.aliasable.prefix, Sum(), tile_idx);
+ unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS);
+ if (warp_id == 0)
+ {
+ prefix_op(tile_aggregate);
+ if (threadIdx.x == 0)
+ temp_storage.tile_exclusive = prefix_op.exclusive_prefix;
+ }
+
+ CTA_SYNC();
+
+ LengthOffsetPair tile_exclusive_in_global = temp_storage.tile_exclusive;
+
+ // Update thread_exclusive_in_warp to fold in warp and tile run-lengths
+ LengthOffsetPair thread_exclusive = scan_op(tile_exclusive_in_global, warp_exclusive_in_tile);
+ if (thread_exclusive_in_warp.key == 0)
+ thread_exclusive_in_warp.value += thread_exclusive.value;
+
+ // Downsweep scan through lengths_and_num_runs
+ LengthOffsetPair lengths_and_num_runs2[ITEMS_PER_THREAD];
+ LengthOffsetPair lengths_and_offsets[ITEMS_PER_THREAD];
+ OffsetT thread_num_runs_exclusive_in_warp[ITEMS_PER_THREAD];
+
+ internal::ThreadScanExclusive(lengths_and_num_runs, lengths_and_num_runs2, scan_op, thread_exclusive_in_warp);
+
+ // Zip
+ #pragma unroll
+ for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+ {
+ lengths_and_offsets[ITEM].value = lengths_and_num_runs2[ITEM].value;
+ lengths_and_offsets[ITEM].key = tile_offset + (threadIdx.x * ITEMS_PER_THREAD) + ITEM;
+ thread_num_runs_exclusive_in_warp[ITEM] = (lengths_and_num_runs[ITEM].key) ?
+ lengths_and_num_runs2[ITEM].key : // keep
+ WARP_THREADS * ITEMS_PER_THREAD; // discard
+ }
+
+ OffsetT tile_num_runs_aggregate = tile_aggregate.key;
+ OffsetT tile_num_runs_exclusive_in_global = tile_exclusive_in_global.key;
+ OffsetT warp_num_runs_aggregate = warp_aggregate.key;
+ OffsetT warp_num_runs_exclusive_in_tile = warp_exclusive_in_tile.key;
+
+ // Scatter
+ Scatter<false, LAST_TILE>(
+ tile_num_runs_aggregate,
+ tile_num_runs_exclusive_in_global,
+ warp_num_runs_aggregate,
+ warp_num_runs_exclusive_in_tile,
+ thread_num_runs_exclusive_in_warp,
+ lengths_and_offsets);
+
+ // Return running total (inclusive of this tile)
+ return prefix_op.inclusive_prefix;
+ }
+ }
+
+
+ /**
+ * Scan tiles of items as part of a dynamic chained scan
+ */
+ template <typename NumRunsIteratorT> ///< Output iterator type for recording number of items selected
+ __device__ __forceinline__ void ConsumeRange(
+ int num_tiles, ///< Total number of input tiles
+ ScanTileStateT& tile_status, ///< Global list of tile status
+ NumRunsIteratorT d_num_runs_out) ///< Output pointer for total number of runs identified
+ {
+ // Blocks are launched in increasing order, so just assign one tile per block
+ int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index
+ OffsetT tile_offset = tile_idx * TILE_ITEMS; // Global offset for the current tile
+ OffsetT num_remaining = num_items - tile_offset; // Remaining items (including this tile)
+
+ if (tile_idx < num_tiles - 1)
+ {
+ // Not the last tile (full)
+ ConsumeTile<false>(num_items, num_remaining, tile_idx, tile_offset, tile_status);
+ }
+ else if (num_remaining > 0)
+ {
+ // The last tile (possibly partially-full)
+ LengthOffsetPair running_total = ConsumeTile<true>(num_items, num_remaining, tile_idx, tile_offset, tile_status);
+
+ if (threadIdx.x == 0)
+ {
+ // Output the total number of items selected
+ *d_num_runs_out = running_total.key;
+
+ // The inclusive prefix contains accumulated length reduction for the last run
+ if (running_total.key > 0)
+ d_lengths_out[running_total.key - 1] = running_total.value;
+ }
+ }
+ }
+};
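+
+// Usage sketch (illustrative only, simplified from the dispatch layer): a
+// sweep kernel gives each thread block one AgentRle instance backed by shared
+// memory and lets it consume its assigned tiles, roughly as follows:
+//
+//   template <
+//       typename AgentRleT, typename InputIteratorT, typename OffsetsOutputIteratorT,
+//       typename LengthsOutputIteratorT, typename NumRunsIteratorT,
+//       typename ScanTileStateT, typename EqualityOpT, typename OffsetT>
+//   __global__ void ExampleRleSweepKernel(
+//       InputIteratorT d_in, OffsetsOutputIteratorT d_offsets_out,
+//       LengthsOutputIteratorT d_lengths_out, NumRunsIteratorT d_num_runs_out,
+//       ScanTileStateT tile_status, EqualityOpT equality_op,
+//       OffsetT num_items, int num_tiles)
+//   {
+//       // Shared memory for AgentRle
+//       __shared__ typename AgentRleT::TempStorage temp_storage;
+//
+//       // Process this block's tile(s)
+//       AgentRleT(temp_storage, d_in, d_offsets_out, d_lengths_out, equality_op, num_items)
+//           .ConsumeRange(num_tiles, tile_status, d_num_runs_out);
+//   }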
+
+
+} // CUB namespace
+CUB_NS_POSTFIX // Optional outer namespace(s)
+
diff --git a/3rdparty/cub-1.8.0/cub/agent/agent_scan.cuh b/3rdparty/cub-1.8.0/cub/agent/agent_scan.cuh
new file mode 100644
index 00000000..9368615e
--- /dev/null
+++ b/3rdparty/cub-1.8.0/cub/agent/agent_scan.cuh
@@ -0,0 +1,471 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::AgentScan implements a stateful abstraction of CUDA thread blocks for participating in device-wide prefix scan.
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "single_pass_scan_operators.cuh"
+#include "../block/block_load.cuh"
+#include "../block/block_store.cuh"
+#include "../block/block_scan.cuh"
+#include "../grid/grid_queue.cuh"
+#include "../iterator/cache_modified_input_iterator.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Tuning policy types
+ ******************************************************************************/
+
+/**
+ * Parameterizable tuning policy type for AgentScan
+ */
+template <
+ int _BLOCK_THREADS, ///< Threads per thread block
+ int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input)
+ BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use
+ CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements
+ BlockStoreAlgorithm _STORE_ALGORITHM, ///< The BlockStore algorithm to use
+ BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use
+struct AgentScanPolicy
+{
+ enum
+ {
+ BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block
+ ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input)
+ };
+
+ static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use
+ static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements
+ static const BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM; ///< The BlockStore algorithm to use
+ static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use
+};
+
+
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+/**
+ * \brief AgentScan implements a stateful abstraction of CUDA thread blocks for participating in device-wide prefix scan.
+ */
+template <
+ typename AgentScanPolicyT, ///< Parameterized AgentScanPolicyT tuning policy type
+ typename InputIteratorT, ///< Random-access input iterator type
+ typename OutputIteratorT, ///< Random-access output iterator type
+ typename ScanOpT, ///< Scan functor type
+ typename InitValueT, ///< The init_value element for ScanOpT type (cub::NullType for inclusive scan)
+ typename OffsetT> ///< Signed integer type for global offsets
+struct AgentScan
+{
+ //---------------------------------------------------------------------
+ // Types and constants
+ //---------------------------------------------------------------------
+
+ // The input value type
+ typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
+
+ // The output value type
+ typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ?
+ typename std::iterator_traits<InputIteratorT>::value_type, // ... then the input iterator's value type,
+ typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT; // ... else the output iterator's value type
+
+ // Tile status descriptor interface type
+ typedef ScanTileState<OutputT> ScanTileStateT;
+
+ // Input iterator wrapper type (for applying cache modifier)
+ typedef typename If<IsPointer<InputIteratorT>::VALUE,
+ CacheModifiedInputIterator<AgentScanPolicyT::LOAD_MODIFIER, OutputT, OffsetT>, // Wrap the native input pointer with CacheModifiedInputIterator
+ InputIteratorT>::Type // Directly use the supplied input iterator type
+ WrappedInputIteratorT;
+
+ // Constants
+ enum
+ {
+ IS_INCLUSIVE = Equals<InitValueT, NullType>::VALUE, // Inclusive scan if no init_value type is provided
+ BLOCK_THREADS = AgentScanPolicyT::BLOCK_THREADS,
+ ITEMS_PER_THREAD = AgentScanPolicyT::ITEMS_PER_THREAD,
+ TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
+ };
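+
+ // For example (illustrative instantiation): passing InitValueT = cub::NullType,
+ // e.g. AgentScan<PolicyT, const int*, int*, cub::Sum, NullType, int>, makes
+ // IS_INCLUSIVE nonzero and selects the inclusive-scan path, whereas supplying
+ // a concrete init-value type such as int selects the exclusive-scan path.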
+
+ // Parameterized BlockLoad type
+ typedef BlockLoad<
+ OutputT,
+ AgentScanPolicyT::BLOCK_THREADS,
+ AgentScanPolicyT::ITEMS_PER_THREAD,
+ AgentScanPolicyT::LOAD_ALGORITHM>
+ BlockLoadT;
+
+ // Parameterized BlockStore type
+ typedef BlockStore<
+ OutputT,
+ AgentScanPolicyT::BLOCK_THREADS,
+ AgentScanPolicyT::ITEMS_PER_THREAD,
+ AgentScanPolicyT::STORE_ALGORITHM>
+ BlockStoreT;
+
+ // Parameterized BlockScan type
+ typedef BlockScan<
+ OutputT,
+ AgentScanPolicyT::BLOCK_THREADS,
+ AgentScanPolicyT::SCAN_ALGORITHM>
+ BlockScanT;
+
+ // Callback type for obtaining tile prefix during block scan
+ typedef TilePrefixCallbackOp<
+ OutputT,
+ ScanOpT,
+ ScanTileStateT>
+ TilePrefixCallbackOpT;
+
+ // Stateful BlockScan prefix callback type for managing a running total while scanning consecutive tiles
+ typedef BlockScanRunningPrefixOp<
+ OutputT,
+ ScanOpT>
+ RunningPrefixCallbackOp;
+
+ // Shared memory type for this thread block
+ union _TempStorage
+ {
+ typename BlockLoadT::TempStorage load; // Smem needed for tile loading
+ typename BlockStoreT::TempStorage store; // Smem needed for tile storing
+
+ struct
+ {
+ typename TilePrefixCallbackOpT::TempStorage prefix; // Smem needed for cooperative prefix callback
+ typename BlockScanT::TempStorage scan; // Smem needed for tile scanning
+ };
+ };
+
+ // Alias wrapper allowing storage to be unioned
+ struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+ //---------------------------------------------------------------------
+ // Per-thread fields
+ //---------------------------------------------------------------------
+
+ _TempStorage& temp_storage; ///< Reference to temp_storage
+ WrappedInputIteratorT d_in; ///< Input data
+ OutputIteratorT d_out; ///< Output data
+ ScanOpT scan_op; ///< Binary scan operator
+ InitValueT init_value; ///< The init_value element for ScanOpT
+
+
+ //---------------------------------------------------------------------
+ // Block scan utility methods
+ //---------------------------------------------------------------------
+
+ /**
+ * Exclusive scan specialization (first tile)
+ */
+ __device__ __forceinline__
+ void ScanTile(
+ OutputT (&items)[ITEMS_PER_THREAD],
+ OutputT init_value,
+ ScanOpT scan_op,
+ OutputT &block_aggregate,
+ Int2Type<false> /*is_inclusive*/)